LCOV - code coverage report
Current view: top level - src/backend/storage/ipc - latch.c (source / functions) Hit Total Coverage
Test: PostgreSQL Lines: 229 273 83.9 %
Date: 2017-09-29 13:40:31 Functions: 19 19 100.0 %
Legend: Lines: hit / not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * latch.c
       4             :  *    Routines for inter-process latches
       5             :  *
       6             :  * The Unix implementation uses the so-called self-pipe trick to overcome the
       7             :  * race condition involved with poll() (or epoll_wait() on linux) and setting
       8             :  * a global flag in the signal handler. When a latch is set and the current
       9             :  * process is waiting for it, the signal handler wakes up the poll() in
      10             :  * WaitLatch by writing a byte to a pipe. A signal by itself doesn't interrupt
      11             :  * poll() on all platforms, and even on platforms where it does, a signal that
      12             :  * arrives just before the poll() call does not prevent poll() from entering
      13             :  * sleep. An incoming byte on a pipe however reliably interrupts the sleep,
      14             :  * and causes poll() to return immediately even if the signal arrives before
      15             :  * poll() begins.
      16             :  *
      17             :  * When SetLatch is called from the same process that owns the latch,
      18             :  * SetLatch writes the byte directly to the pipe. If it's owned by another
      19             :  * process, SIGUSR1 is sent and the signal handler in the waiting process
      20             :  * writes the byte to the pipe on behalf of the signaling process.
      21             :  *
      22             :  * The Windows implementation uses Windows events that are inherited by all
      23             :  * postmaster child processes. There's no need for the self-pipe trick there.
      24             :  *
      25             :  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
      26             :  * Portions Copyright (c) 1994, Regents of the University of California
      27             :  *
      28             :  * IDENTIFICATION
      29             :  *    src/backend/storage/ipc/latch.c
      30             :  *
      31             :  *-------------------------------------------------------------------------
      32             :  */
      33             : #include "postgres.h"
      34             : 
      35             : #include <fcntl.h>
      36             : #include <limits.h>
      37             : #include <signal.h>
      38             : #include <unistd.h>
      39             : #ifdef HAVE_SYS_EPOLL_H
      40             : #include <sys/epoll.h>
      41             : #endif
      42             : #ifdef HAVE_POLL_H
      43             : #include <poll.h>
      44             : #endif
      45             : 
      46             : #include "miscadmin.h"
      47             : #include "pgstat.h"
      48             : #include "port/atomics.h"
      49             : #include "portability/instr_time.h"
      50             : #include "postmaster/postmaster.h"
      51             : #include "storage/latch.h"
      52             : #include "storage/pmsignal.h"
      53             : #include "storage/shmem.h"
      54             : 
      55             : /*
      56             :  * Select the fd readiness primitive to use. Normally the "most modern"
      57             :  * primitive supported by the OS will be used, but for testing it can be
      58             :  * useful to manually specify the used primitive.  If desired, just add a
      59             :  * define somewhere before this block.
      60             :  */
      61             : #if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \
      62             :     defined(WAIT_USE_WIN32)
      63             : /* don't overwrite manual choice */
      64             : #elif defined(HAVE_SYS_EPOLL_H)
      65             : #define WAIT_USE_EPOLL
      66             : #elif defined(HAVE_POLL)
      67             : #define WAIT_USE_POLL
      68             : #elif WIN32
      69             : #define WAIT_USE_WIN32
      70             : #else
      71             : #error "no wait set implementation available"
      72             : #endif
      73             : 
/* typedef in latch.h */
struct WaitEventSet
{
	int			nevents;		/* number of registered events */
	int			nevents_space;	/* maximum number of events in this set */

	/*
	 * Array, of nevents_space length, storing the definition of events this
	 * set is waiting for.
	 */
	WaitEvent  *events;

	/*
	 * If WL_LATCH_SET is specified in any wait event, latch is a pointer to
	 * said latch, and latch_pos the offset in the ->events array. This is
	 * useful because we check the state of the latch before performing
	 * syscalls related to waiting.
	 */
	Latch	   *latch;
	int			latch_pos;

#if defined(WAIT_USE_EPOLL)
	int			epoll_fd;		/* epoll instance owned by this set */
	/* epoll_wait returns events in a user provided arrays, allocate once */
	struct epoll_event *epoll_ret_events;
#elif defined(WAIT_USE_POLL)
	/* poll expects events to be waited on every poll() call, prepare once */
	struct pollfd *pollfds;
#elif defined(WAIT_USE_WIN32)

	/*
	 * Array of windows events. The first element always contains
	 * pgwin32_signal_event, so the remaining elements are offset by one (i.e.
	 * event->pos + 1).
	 */
	HANDLE	   *handles;
#endif
};
     112             : 
#ifndef WIN32
/* Are we currently in WaitLatch? The signal handler would like to know. */
static volatile sig_atomic_t waiting = false;

/* Read and write ends of the self-pipe */
static int	selfpipe_readfd = -1;
static int	selfpipe_writefd = -1;

/* Process owning the self-pipe --- needed for checking purposes */
static int	selfpipe_owner_pid = 0;

/* Private function prototypes */
static void sendSelfPipeByte(void);
static void drainSelfPipe(void);
#endif							/* WIN32 */

/* Per-primitive helper to (de)register one event with the kernel object */
#if defined(WAIT_USE_EPOLL)
static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action);
#elif defined(WAIT_USE_POLL)
static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event);
#elif defined(WAIT_USE_WIN32)
static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event);
#endif

static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
					  WaitEvent *occurred_events, int nevents);
     139             : 
/*
 * Initialize the process-local latch infrastructure.
 *
 * This must be called once during startup of any process that can wait on
 * latches, before it issues any InitLatch() or OwnLatch() calls.
 *
 * On Unix this creates the per-process self-pipe (closing any pipe FDs
 * inherited from the postmaster first); on Windows there is nothing to do.
 */
void
InitializeLatchSupport(void)
{
#ifndef WIN32
	int			pipefd[2];

	if (IsUnderPostmaster)
	{
		/*
		 * We might have inherited connections to a self-pipe created by the
		 * postmaster.  It's critical that child processes create their own
		 * self-pipes, of course, and we really want them to close the
		 * inherited FDs for safety's sake.
		 */
		if (selfpipe_owner_pid != 0)
		{
			/* Assert we go through here but once in a child process */
			Assert(selfpipe_owner_pid != MyProcPid);
			/* Release postmaster's pipe FDs; ignore any error */
			(void) close(selfpipe_readfd);
			(void) close(selfpipe_writefd);
			/* Clean up, just for safety's sake; we'll set these below */
			selfpipe_readfd = selfpipe_writefd = -1;
			selfpipe_owner_pid = 0;
		}
		else
		{
			/*
			 * Postmaster didn't create a self-pipe ... or else we're in an
			 * EXEC_BACKEND build, in which case it doesn't matter since the
			 * postmaster's pipe FDs were closed by the action of FD_CLOEXEC.
			 */
			Assert(selfpipe_readfd == -1);
		}
	}
	else
	{
		/* In postmaster or standalone backend, assert we do this but once */
		Assert(selfpipe_readfd == -1);
		Assert(selfpipe_owner_pid == 0);
	}

	/*
	 * Set up the self-pipe that allows a signal handler to wake up the
	 * poll()/epoll_wait() in WaitLatch. Make the write-end non-blocking, so
	 * that SetLatch won't block if the event has already been set many times
	 * filling the kernel buffer. Make the read-end non-blocking too, so that
	 * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK.
	 * Also, make both FDs close-on-exec, since we surely do not want any
	 * child processes messing with them.
	 */
	if (pipe(pipefd) < 0)
		elog(FATAL, "pipe() failed: %m");
	if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1)
		elog(FATAL, "fcntl(F_SETFL) failed on read-end of self-pipe: %m");
	if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1)
		elog(FATAL, "fcntl(F_SETFL) failed on write-end of self-pipe: %m");
	if (fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) == -1)
		elog(FATAL, "fcntl(F_SETFD) failed on read-end of self-pipe: %m");
	if (fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) == -1)
		elog(FATAL, "fcntl(F_SETFD) failed on write-end of self-pipe: %m");

	/* Publish the FDs and record which process created them */
	selfpipe_readfd = pipefd[0];
	selfpipe_writefd = pipefd[1];
	selfpipe_owner_pid = MyProcPid;
#else
	/* currently, nothing to do here for Windows */
#endif
}
     215             : 
     216             : /*
     217             :  * Initialize a process-local latch.
     218             :  */
     219             : void
     220         345 : InitLatch(volatile Latch *latch)
     221             : {
     222         345 :     latch->is_set = false;
     223         345 :     latch->owner_pid = MyProcPid;
     224         345 :     latch->is_shared = false;
     225             : 
     226             : #ifndef WIN32
     227             :     /* Assert InitializeLatchSupport has been called in this process */
     228         345 :     Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
     229             : #else
     230             :     latch->event = CreateEvent(NULL, TRUE, FALSE, NULL);
     231             :     if (latch->event == NULL)
     232             :         elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
     233             : #endif                          /* WIN32 */
     234         345 : }
     235             : 
     236             : /*
     237             :  * Initialize a shared latch that can be set from other processes. The latch
     238             :  * is initially owned by no-one; use OwnLatch to associate it with the
     239             :  * current process.
     240             :  *
     241             :  * InitSharedLatch needs to be called in postmaster before forking child
     242             :  * processes, usually right after allocating the shared memory block
     243             :  * containing the latch with ShmemInitStruct. (The Unix implementation
     244             :  * doesn't actually require that, but the Windows one does.) Because of
     245             :  * this restriction, we have no concurrency issues to worry about here.
     246             :  *
     247             :  * Note that other handles created in this module are never marked as
     248             :  * inheritable.  Thus we do not need to worry about cleaning up child
     249             :  * process references to postmaster-private latches or WaitEventSets.
     250             :  */
     251             : void
     252         585 : InitSharedLatch(volatile Latch *latch)
     253             : {
     254             : #ifdef WIN32
     255             :     SECURITY_ATTRIBUTES sa;
     256             : 
     257             :     /*
     258             :      * Set up security attributes to specify that the events are inherited.
     259             :      */
     260             :     ZeroMemory(&sa, sizeof(sa));
     261             :     sa.nLength = sizeof(sa);
     262             :     sa.bInheritHandle = TRUE;
     263             : 
     264             :     latch->event = CreateEvent(&sa, TRUE, FALSE, NULL);
     265             :     if (latch->event == NULL)
     266             :         elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
     267             : #endif
     268             : 
     269         585 :     latch->is_set = false;
     270         585 :     latch->owner_pid = 0;
     271         585 :     latch->is_shared = true;
     272         585 : }
     273             : 
     274             : /*
     275             :  * Associate a shared latch with the current process, allowing it to
     276             :  * wait on the latch.
     277             :  *
     278             :  * Although there is a sanity check for latch-already-owned, we don't do
     279             :  * any sort of locking here, meaning that we could fail to detect the error
     280             :  * if two processes try to own the same latch at about the same time.  If
     281             :  * there is any risk of that, caller must provide an interlock to prevent it.
     282             :  *
     283             :  * In any process that calls OwnLatch(), make sure that
     284             :  * latch_sigusr1_handler() is called from the SIGUSR1 signal handler,
     285             :  * as shared latches use SIGUSR1 for inter-process communication.
     286             :  */
     287             : void
     288         342 : OwnLatch(volatile Latch *latch)
     289             : {
     290             :     /* Sanity checks */
     291         342 :     Assert(latch->is_shared);
     292             : 
     293             : #ifndef WIN32
     294             :     /* Assert InitializeLatchSupport has been called in this process */
     295         342 :     Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
     296             : #endif
     297             : 
     298         342 :     if (latch->owner_pid != 0)
     299           0 :         elog(ERROR, "latch already owned");
     300             : 
     301         342 :     latch->owner_pid = MyProcPid;
     302         342 : }
     303             : 
     304             : /*
     305             :  * Disown a shared latch currently owned by the current process.
     306             :  */
     307             : void
     308         342 : DisownLatch(volatile Latch *latch)
     309             : {
     310         342 :     Assert(latch->is_shared);
     311         342 :     Assert(latch->owner_pid == MyProcPid);
     312             : 
     313         342 :     latch->owner_pid = 0;
     314         342 : }
     315             : 
     316             : /*
     317             :  * Wait for a given latch to be set, or for postmaster death, or until timeout
     318             :  * is exceeded. 'wakeEvents' is a bitmask that specifies which of those events
     319             :  * to wait for. If the latch is already set (and WL_LATCH_SET is given), the
     320             :  * function returns immediately.
     321             :  *
     322             :  * The "timeout" is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag
     323             :  * is given.  Although it is declared as "long", we don't actually support
     324             :  * timeouts longer than INT_MAX milliseconds.  Note that some extra overhead
     325             :  * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible.
     326             :  *
     327             :  * The latch must be owned by the current process, ie. it must be a
     328             :  * process-local latch initialized with InitLatch, or a shared latch
     329             :  * associated with the current process by calling OwnLatch.
     330             :  *
     331             :  * Returns bit mask indicating which condition(s) caused the wake-up. Note
     332             :  * that if multiple wake-up conditions are true, there is no guarantee that
     333             :  * we return all of them in one call, but we will return at least one.
     334             :  */
     335             : int
     336        2524 : WaitLatch(volatile Latch *latch, int wakeEvents, long timeout,
     337             :           uint32 wait_event_info)
     338             : {
     339        2524 :     return WaitLatchOrSocket(latch, wakeEvents, PGINVALID_SOCKET, timeout,
     340             :                              wait_event_info);
     341             : }
     342             : 
/*
 * Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
 * conditions.
 *
 * When waiting on a socket, EOF and error conditions always cause the socket
 * to be reported as readable/writable/connected, so that the caller can deal
 * with the condition.
 *
 * NB: These days this is just a wrapper around the WaitEventSet API. When
 * using a latch very frequently, consider creating a longer living
 * WaitEventSet instead; that's more efficient.  (This function builds and
 * destroys a WaitEventSet on every call.)
 */
int
WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
				  long timeout, uint32 wait_event_info)
{
	int			ret = 0;
	int			rc;
	WaitEvent	event;
	WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 3);

	/* Without WL_TIMEOUT, a negative timeout means "wait forever" */
	if (wakeEvents & WL_TIMEOUT)
		Assert(timeout >= 0);
	else
		timeout = -1;

	if (wakeEvents & WL_LATCH_SET)
		AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
						  (Latch *) latch, NULL);

	/* Postmaster-death waiting only makes sense in a postmaster child */
	if (wakeEvents & WL_POSTMASTER_DEATH && IsUnderPostmaster)
		AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
						  NULL, NULL);

	if (wakeEvents & WL_SOCKET_MASK)
	{
		int			ev;

		ev = wakeEvents & WL_SOCKET_MASK;
		AddWaitEventToSet(set, ev, sock, NULL, NULL);
	}

	/* Wait for at most one event to be reported */
	rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);

	if (rc == 0)
		ret |= WL_TIMEOUT;		/* zero occurred events means timeout */
	else
	{
		/* Report only the event classes the caller understands */
		ret |= event.events & (WL_LATCH_SET |
							   WL_POSTMASTER_DEATH |
							   WL_SOCKET_MASK);
	}

	FreeWaitEventSet(set);

	return ret;
}
     400             : 
/*
 * Sets a latch and wakes up anyone waiting on it.
 *
 * This is cheap if the latch is already set, otherwise not so much.
 *
 * NB: when calling this in a signal handler, be sure to save and restore
 * errno around it.  (That's standard practice in most signal handlers, of
 * course, but we used to omit it in handlers that only set a flag.)
 *
 * NB: this function is called from critical sections and signal handlers so
 * throwing an error is not a good idea.
 */
void
SetLatch(volatile Latch *latch)
{
#ifndef WIN32
	pid_t		owner_pid;
#else
	HANDLE		handle;
#endif

	/*
	 * The memory barrier has to be placed here to ensure that any flag
	 * variables possibly changed by this process have been flushed to main
	 * memory, before we check/set is_set.  (It pairs with the barrier in
	 * ResetLatch.)
	 */
	pg_memory_barrier();

	/* Quick exit if already set */
	if (latch->is_set)
		return;

	latch->is_set = true;

#ifndef WIN32

	/*
	 * See if anyone's waiting for the latch. It can be the current process if
	 * we're in a signal handler. We use the self-pipe to wake up the
	 * poll()/epoll_wait() in that case. If it's another process, send a
	 * signal.
	 *
	 * Fetch owner_pid only once, in case the latch is concurrently getting
	 * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't
	 * guaranteed to be true! In practice, the effective range of pid_t fits
	 * in a 32 bit integer, and so should be atomic. In the worst case, we
	 * might end up signaling the wrong process. Even then, you're very
	 * unlucky if a process with that bogus pid exists and belongs to
	 * Postgres; and PG database processes should handle excess SIGUSR1
	 * interrupts without a problem anyhow.
	 *
	 * Another sort of race condition that's possible here is for a new
	 * process to own the latch immediately after we look, so we don't signal
	 * it. This is okay so long as all callers of ResetLatch/WaitLatch follow
	 * the standard coding convention of waiting at the bottom of their loops,
	 * not the top, so that they'll correctly process latch-setting events
	 * that happen before they enter the loop.
	 */
	owner_pid = latch->owner_pid;
	if (owner_pid == 0)
		return;					/* no owner, nobody to wake */
	else if (owner_pid == MyProcPid)
	{
		/* Owned by us: write to the self-pipe only if we're in WaitLatch */
		if (waiting)
			sendSelfPipeByte();
	}
	else
		kill(owner_pid, SIGUSR1);
#else

	/*
	 * See if anyone's waiting for the latch. It can be the current process if
	 * we're in a signal handler.
	 *
	 * Use a local variable here just in case somebody changes the event field
	 * concurrently (which really should not happen).
	 */
	handle = latch->event;
	if (handle)
	{
		SetEvent(handle);

		/*
		 * Note that we silently ignore any errors. We might be in a signal
		 * handler or other critical path where it's not safe to call elog().
		 */
	}
#endif

}
     491             : 
/*
 * Clear the latch. Calling WaitLatch after this will sleep, unless
 * the latch is set again before the WaitLatch call.
 */
void
ResetLatch(volatile Latch *latch)
{
	/* Only the owner should reset the latch */
	Assert(latch->owner_pid == MyProcPid);

	latch->is_set = false;

	/*
	 * Ensure that the write to is_set gets flushed to main memory before we
	 * examine any flag variables.  Otherwise a concurrent SetLatch might
	 * falsely conclude that it needn't signal us, even though we have missed
	 * seeing some flag updates that SetLatch was supposed to inform us of.
	 * (This pairs with the barrier at the top of SetLatch.)
	 */
	pg_memory_barrier();
}
     512             : 
     513             : /*
     514             :  * Create a WaitEventSet with space for nevents different events to wait for.
     515             :  *
     516             :  * These events can then be efficiently waited upon together, using
     517             :  * WaitEventSetWait().
     518             :  */
     519             : WaitEventSet *
     520        4949 : CreateWaitEventSet(MemoryContext context, int nevents)
     521             : {
     522             :     WaitEventSet *set;
     523             :     char       *data;
     524        4949 :     Size        sz = 0;
     525             : 
     526             :     /*
     527             :      * Use MAXALIGN size/alignment to guarantee that later uses of memory are
     528             :      * aligned correctly. E.g. epoll_event might need 8 byte alignment on some
     529             :      * platforms, but earlier allocations like WaitEventSet and WaitEvent
     530             :      * might not sized to guarantee that when purely using sizeof().
     531             :      */
     532        4949 :     sz += MAXALIGN(sizeof(WaitEventSet));
     533        4949 :     sz += MAXALIGN(sizeof(WaitEvent) * nevents);
     534             : 
     535             : #if defined(WAIT_USE_EPOLL)
     536        4949 :     sz += MAXALIGN(sizeof(struct epoll_event) * nevents);
     537             : #elif defined(WAIT_USE_POLL)
     538             :     sz += MAXALIGN(sizeof(struct pollfd) * nevents);
     539             : #elif defined(WAIT_USE_WIN32)
     540             :     /* need space for the pgwin32_signal_event */
     541             :     sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1));
     542             : #endif
     543             : 
     544        4949 :     data = (char *) MemoryContextAllocZero(context, sz);
     545             : 
     546        4949 :     set = (WaitEventSet *) data;
     547        4949 :     data += MAXALIGN(sizeof(WaitEventSet));
     548             : 
     549        4949 :     set->events = (WaitEvent *) data;
     550        4949 :     data += MAXALIGN(sizeof(WaitEvent) * nevents);
     551             : 
     552             : #if defined(WAIT_USE_EPOLL)
     553        4949 :     set->epoll_ret_events = (struct epoll_event *) data;
     554        4949 :     data += MAXALIGN(sizeof(struct epoll_event) * nevents);
     555             : #elif defined(WAIT_USE_POLL)
     556             :     set->pollfds = (struct pollfd *) data;
     557             :     data += MAXALIGN(sizeof(struct pollfd) * nevents);
     558             : #elif defined(WAIT_USE_WIN32)
     559             :     set->handles = (HANDLE) data;
     560             :     data += MAXALIGN(sizeof(HANDLE) * nevents);
     561             : #endif
     562             : 
     563        4949 :     set->latch = NULL;
     564        4949 :     set->nevents_space = nevents;
     565             : 
     566             : #if defined(WAIT_USE_EPOLL)
     567             : #ifdef EPOLL_CLOEXEC
     568        4949 :     set->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
     569        4949 :     if (set->epoll_fd < 0)
     570           0 :         elog(ERROR, "epoll_create1 failed: %m");
     571             : #else
     572             :     /* cope with ancient glibc lacking epoll_create1 (e.g., RHEL5) */
     573             :     set->epoll_fd = epoll_create(nevents);
     574             :     if (set->epoll_fd < 0)
     575             :         elog(ERROR, "epoll_create failed: %m");
     576             :     if (fcntl(set->epoll_fd, F_SETFD, FD_CLOEXEC) == -1)
     577             :         elog(ERROR, "fcntl(F_SETFD) failed on epoll descriptor: %m");
     578             : #endif                          /* EPOLL_CLOEXEC */
     579             : #elif defined(WAIT_USE_WIN32)
     580             : 
     581             :     /*
     582             :      * To handle signals while waiting, we need to add a win32 specific event.
     583             :      * We accounted for the additional event at the top of this routine. See
     584             :      * port/win32/signal.c for more details.
     585             :      *
     586             :      * Note: pgwin32_signal_event should be first to ensure that it will be
     587             :      * reported when multiple events are set.  We want to guarantee that
     588             :      * pending signals are serviced.
     589             :      */
     590             :     set->handles[0] = pgwin32_signal_event;
     591             :     StaticAssertStmt(WSA_INVALID_EVENT == NULL, "");
     592             : #endif
     593             : 
     594        4949 :     return set;
     595             : }
     596             : 
     597             : /*
     598             :  * Free a previously created WaitEventSet.
     599             :  *
     600             :  * Note: preferably, this shouldn't have to free any resources that could be
     601             :  * inherited across an exec().  If it did, we'd likely leak those resources in
     602             :  * many scenarios.  For the epoll case, we ensure that by setting FD_CLOEXEC
     603             :  * when the FD is created.  For the Windows case, we assume that the handles
     604             :  * involved are non-inheritable.
     605             :  */
     606             : void
     607        4734 : FreeWaitEventSet(WaitEventSet *set)
     608             : {
     609             : #if defined(WAIT_USE_EPOLL)
     610        4734 :     close(set->epoll_fd);
     611             : #elif defined(WAIT_USE_WIN32)
     612             :     WaitEvent  *cur_event;
     613             : 
     614             :     for (cur_event = set->events;
     615             :          cur_event < (set->events + set->nevents);
     616             :          cur_event++)
     617             :     {
     618             :         if (cur_event->events & WL_LATCH_SET)
     619             :         {
     620             :             /* uses the latch's HANDLE */
     621             :         }
     622             :         else if (cur_event->events & WL_POSTMASTER_DEATH)
     623             :         {
     624             :             /* uses PostmasterHandle */
     625             :         }
     626             :         else
     627             :         {
     628             :             /* Clean up the event object we created for the socket */
     629             :             WSAEventSelect(cur_event->fd, NULL, 0);
     630             :             WSACloseEvent(set->handles[cur_event->pos + 1]);
     631             :         }
     632             :     }
     633             : #endif
     634             : 
     635        4734 :     pfree(set);
     636        4734 : }
     637             : 
     638             : /* ---
     639             :  * Add an event to the set. Possible events are:
     640             :  * - WL_LATCH_SET: Wait for the latch to be set
     641             :  * - WL_POSTMASTER_DEATH: Wait for postmaster to die
     642             :  * - WL_SOCKET_READABLE: Wait for socket to become readable,
     643             :  *   can be combined in one event with other WL_SOCKET_* events
     644             :  * - WL_SOCKET_WRITEABLE: Wait for socket to become writeable,
     645             :  *   can be combined with other WL_SOCKET_* events
     646             :  * - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
     647             :  *   can be combined with other WL_SOCKET_* events (on non-Windows
     648             :  *   platforms, this is the same as WL_SOCKET_WRITEABLE)
     649             :  *
     650             :  * Returns the offset in WaitEventSet->events (starting from 0), which can be
     651             :  * used to modify previously added wait events using ModifyWaitEvent().
     652             :  *
     653             :  * In the WL_LATCH_SET case the latch must be owned by the current process,
     654             :  * i.e. it must be a process-local latch initialized with InitLatch, or a
     655             :  * shared latch associated with the current process by calling OwnLatch.
     656             :  *
     657             :  * In the WL_SOCKET_READABLE/WRITEABLE/CONNECTED cases, EOF and error
     658             :  * conditions cause the socket to be reported as readable/writable/connected,
     659             :  * so that the caller can deal with the condition.
     660             :  *
     661             :  * The user_data pointer specified here will be set for the events returned
     662             :  * by WaitEventSetWait(), allowing to easily associate additional data with
     663             :  * events.
     664             :  */
     665             : int
     666       11447 : AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
     667             :                   void *user_data)
     668             : {
     669             :     WaitEvent  *event;
     670             : 
     671             :     /* not enough space */
     672       11447 :     Assert(set->nevents < set->nevents_space);
     673             : 
     674       11447 :     if (latch)
     675             :     {
     676        4949 :         if (latch->owner_pid != MyProcPid)
     677           0 :             elog(ERROR, "cannot wait on a latch owned by another process");
     678        4949 :         if (set->latch)
     679           0 :             elog(ERROR, "cannot wait on more than one latch");
     680        4949 :         if ((events & WL_LATCH_SET) != WL_LATCH_SET)
     681           0 :             elog(ERROR, "latch events only support being set");
     682             :     }
     683             :     else
     684             :     {
     685        6498 :         if (events & WL_LATCH_SET)
     686           0 :             elog(ERROR, "cannot wait on latch without a specified latch");
     687             :     }
     688             : 
     689             :     /* waiting for socket readiness without a socket indicates a bug */
     690       11447 :     if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK))
     691           0 :         elog(ERROR, "cannot wait on socket event without a socket");
     692             : 
     693       11447 :     event = &set->events[set->nevents];
     694       11447 :     event->pos = set->nevents++;
     695       11447 :     event->fd = fd;
     696       11447 :     event->events = events;
     697       11447 :     event->user_data = user_data;
     698             : #ifdef WIN32
     699             :     event->reset = false;
     700             : #endif
     701             : 
     702       11447 :     if (events == WL_LATCH_SET)
     703             :     {
     704        4949 :         set->latch = latch;
     705        4949 :         set->latch_pos = event->pos;
     706             : #ifndef WIN32
     707        4949 :         event->fd = selfpipe_readfd;
     708             : #endif
     709             :     }
     710        6498 :     else if (events == WL_POSTMASTER_DEATH)
     711             :     {
     712             : #ifndef WIN32
     713        4073 :         event->fd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
     714             : #endif
     715             :     }
     716             : 
     717             :     /* perform wait primitive specific initialization, if needed */
     718             : #if defined(WAIT_USE_EPOLL)
     719       11447 :     WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
     720             : #elif defined(WAIT_USE_POLL)
     721             :     WaitEventAdjustPoll(set, event);
     722             : #elif defined(WAIT_USE_WIN32)
     723             :     WaitEventAdjustWin32(set, event);
     724             : #endif
     725             : 
     726       11447 :     return event->pos;
     727             : }
     728             : 
     729             : /*
     730             :  * Change the event mask and, in the WL_LATCH_SET case, the latch associated
     731             :  * with the WaitEvent.
     732             :  *
     733             :  * 'pos' is the id returned by AddWaitEventToSet.
     734             :  */
     735             : void
     736       15709 : ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
     737             : {
     738             :     WaitEvent  *event;
     739             : 
     740       15709 :     Assert(pos < set->nevents);
     741             : 
     742       15709 :     event = &set->events[pos];
     743             : 
     744             :     /*
     745             :      * If neither the event mask nor the associated latch changes, return
     746             :      * early. That's an important optimization for some sockets, where
     747             :      * ModifyWaitEvent is frequently used to switch from waiting for reads to
     748             :      * waiting on writes.
     749             :      */
     750       31207 :     if (events == event->events &&
     751       15928 :         (!(event->events & WL_LATCH_SET) || set->latch == latch))
     752       30777 :         return;
     753             : 
     754        1071 :     if (event->events & WL_LATCH_SET &&
     755         430 :         events != event->events)
     756             :     {
     757             :         /* we could allow to disable latch events for a while */
     758           0 :         elog(ERROR, "cannot modify latch event");
     759             :     }
     760             : 
     761         641 :     if (event->events & WL_POSTMASTER_DEATH)
     762             :     {
     763           0 :         elog(ERROR, "cannot modify postmaster death event");
     764             :     }
     765             : 
     766             :     /* FIXME: validate event mask */
     767         641 :     event->events = events;
     768             : 
     769         641 :     if (events == WL_LATCH_SET)
     770             :     {
     771         430 :         set->latch = latch;
     772             :     }
     773             : 
     774             : #if defined(WAIT_USE_EPOLL)
     775         641 :     WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
     776             : #elif defined(WAIT_USE_POLL)
     777             :     WaitEventAdjustPoll(set, event);
     778             : #elif defined(WAIT_USE_WIN32)
     779             :     WaitEventAdjustWin32(set, event);
     780             : #endif
     781             : }
     782             : 
     783             : #if defined(WAIT_USE_EPOLL)
     784             : /*
     785             :  * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
     786             :  */
     787             : static void
     788       12088 : WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
     789             : {
     790             :     struct epoll_event epoll_ev;
     791             :     int         rc;
     792             : 
     793             :     /* pointer to our event, returned by epoll_wait */
     794       12088 :     epoll_ev.data.ptr = event;
     795             :     /* always wait for errors */
     796       12088 :     epoll_ev.events = EPOLLERR | EPOLLHUP;
     797             : 
     798             :     /* prepare pollfd entry once */
     799       12088 :     if (event->events == WL_LATCH_SET)
     800             :     {
     801        5379 :         Assert(set->latch != NULL);
     802        5379 :         epoll_ev.events |= EPOLLIN;
     803             :     }
     804        6709 :     else if (event->events == WL_POSTMASTER_DEATH)
     805             :     {
     806        4073 :         epoll_ev.events |= EPOLLIN;
     807             :     }
     808             :     else
     809             :     {
     810        2636 :         Assert(event->fd != PGINVALID_SOCKET);
     811        2636 :         Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
     812             : 
     813        2636 :         if (event->events & WL_SOCKET_READABLE)
     814        2421 :             epoll_ev.events |= EPOLLIN;
     815        2636 :         if (event->events & WL_SOCKET_WRITEABLE)
     816         215 :             epoll_ev.events |= EPOLLOUT;
     817             :     }
     818             : 
     819             :     /*
     820             :      * Even though unused, we also pass epoll_ev as the data argument if
     821             :      * EPOLL_CTL_DEL is passed as action.  There used to be an epoll bug
     822             :      * requiring that, and actually it makes the code simpler...
     823             :      */
     824       12088 :     rc = epoll_ctl(set->epoll_fd, action, event->fd, &epoll_ev);
     825             : 
     826       12088 :     if (rc < 0)
     827           0 :         ereport(ERROR,
     828             :                 (errcode_for_socket_access(),
     829             :                  errmsg("epoll_ctl() failed: %m")));
     830       12088 : }
     831             : #endif
     832             : 
     833             : #if defined(WAIT_USE_POLL)
     834             : static void
     835             : WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
     836             : {
     837             :     struct pollfd *pollfd = &set->pollfds[event->pos];
     838             : 
     839             :     pollfd->revents = 0;
     840             :     pollfd->fd = event->fd;
     841             : 
     842             :     /* prepare pollfd entry once */
     843             :     if (event->events == WL_LATCH_SET)
     844             :     {
     845             :         Assert(set->latch != NULL);
     846             :         pollfd->events = POLLIN;
     847             :     }
     848             :     else if (event->events == WL_POSTMASTER_DEATH)
     849             :     {
     850             :         pollfd->events = POLLIN;
     851             :     }
     852             :     else
     853             :     {
     854             :         Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
     855             :         pollfd->events = 0;
     856             :         if (event->events & WL_SOCKET_READABLE)
     857             :             pollfd->events |= POLLIN;
     858             :         if (event->events & WL_SOCKET_WRITEABLE)
     859             :             pollfd->events |= POLLOUT;
     860             :     }
     861             : 
     862             :     Assert(event->fd != PGINVALID_SOCKET);
     863             : }
     864             : #endif
     865             : 
     866             : #if defined(WAIT_USE_WIN32)
     867             : static void
     868             : WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
     869             : {
     870             :     HANDLE     *handle = &set->handles[event->pos + 1];
     871             : 
     872             :     if (event->events == WL_LATCH_SET)
     873             :     {
     874             :         Assert(set->latch != NULL);
     875             :         *handle = set->latch->event;
     876             :     }
     877             :     else if (event->events == WL_POSTMASTER_DEATH)
     878             :     {
     879             :         *handle = PostmasterHandle;
     880             :     }
     881             :     else
     882             :     {
     883             :         int         flags = FD_CLOSE;   /* always check for errors/EOF */
     884             : 
     885             :         if (event->events & WL_SOCKET_READABLE)
     886             :             flags |= FD_READ;
     887             :         if (event->events & WL_SOCKET_WRITEABLE)
     888             :             flags |= FD_WRITE;
     889             :         if (event->events & WL_SOCKET_CONNECTED)
     890             :             flags |= FD_CONNECT;
     891             : 
     892             :         if (*handle == WSA_INVALID_EVENT)
     893             :         {
     894             :             *handle = WSACreateEvent();
     895             :             if (*handle == WSA_INVALID_EVENT)
     896             :                 elog(ERROR, "failed to create event for socket: error code %u",
     897             :                      WSAGetLastError());
     898             :         }
     899             :         if (WSAEventSelect(event->fd, *handle, flags) != 0)
     900             :             elog(ERROR, "failed to set up event for socket: error code %u",
     901             :                  WSAGetLastError());
     902             : 
     903             :         Assert(event->fd != PGINVALID_SOCKET);
     904             :     }
     905             : }
     906             : #endif
     907             : 
     908             : /*
     909             :  * Wait for events added to the set to happen, or until the timeout is
     910             :  * reached.  At most nevents occurred events are returned.
     911             :  *
     912             :  * If timeout = -1, block until an event occurs; if 0, check sockets for
     913             :  * readiness, but don't block; if > 0, block for at most timeout milliseconds.
     914             :  *
      915             :  * Returns the number of events that occurred, or 0 if the timeout was reached.
     916             :  *
     917             :  * Returned events will have the fd, pos, user_data fields set to the
     918             :  * values associated with the registered event.
     919             :  */
     920             : int
     921       20013 : WaitEventSetWait(WaitEventSet *set, long timeout,
     922             :                  WaitEvent *occurred_events, int nevents,
     923             :                  uint32 wait_event_info)
     924             : {
     925       20013 :     int         returned_events = 0;
     926             :     instr_time  start_time;
     927             :     instr_time  cur_time;
     928       20013 :     long        cur_timeout = -1;
     929             : 
     930       20013 :     Assert(nevents > 0);
     931             : 
     932             :     /*
     933             :      * Initialize timeout if requested.  We must record the current time so
     934             :      * that we can determine the remaining timeout if interrupted.
     935             :      */
     936       20013 :     if (timeout >= 0)
     937             :     {
     938        1578 :         INSTR_TIME_SET_CURRENT(start_time);
     939        1578 :         Assert(timeout >= 0 && timeout <= INT_MAX);
     940        1578 :         cur_timeout = timeout;
     941             :     }
     942             : 
     943       20013 :     pgstat_report_wait_start(wait_event_info);
     944             : 
     945             : #ifndef WIN32
     946       20013 :     waiting = true;
     947             : #else
     948             :     /* Ensure that signals are serviced even if latch is already set */
     949             :     pgwin32_dispatch_queued_signals();
     950             : #endif
     951       60332 :     while (returned_events == 0)
     952             :     {
     953             :         int         rc;
     954             : 
     955             :         /*
     956             :          * Check if the latch is set already. If so, leave the loop
     957             :          * immediately, avoid blocking again. We don't attempt to report any
     958             :          * other events that might also be satisfied.
     959             :          *
     960             :          * If someone sets the latch between this and the
     961             :          * WaitEventSetWaitBlock() below, the setter will write a byte to the
     962             :          * pipe (or signal us and the signal handler will do that), and the
     963             :          * readiness routine will return immediately.
     964             :          *
      965             :          * On Unix, if there's a pending byte in the self pipe, we'll notice
     966             :          * whenever blocking. Only clearing the pipe in that case avoids
     967             :          * having to drain it every time WaitLatchOrSocket() is used. Should
     968             :          * the pipe-buffer fill up we're still ok, because the pipe is in
     969             :          * nonblocking mode. It's unlikely for that to happen, because the
     970             :          * self pipe isn't filled unless we're blocking (waiting = true), or
     971             :          * from inside a signal handler in latch_sigusr1_handler().
     972             :          *
     973             :          * On windows, we'll also notice if there's a pending event for the
     974             :          * latch when blocking, but there's no danger of anything filling up,
     975             :          * as "Setting an event that is already set has no effect.".
     976             :          *
     977             :          * Note: we assume that the kernel calls involved in latch management
     978             :          * will provide adequate synchronization on machines with weak memory
     979             :          * ordering, so that we cannot miss seeing is_set if a notification
     980             :          * has already been queued.
     981             :          */
     982       23106 :         if (set->latch && set->latch->is_set)
     983             :         {
     984        1891 :             occurred_events->fd = PGINVALID_SOCKET;
     985        1891 :             occurred_events->pos = set->latch_pos;
     986        1891 :             occurred_events->user_data =
     987        1891 :                 set->events[set->latch_pos].user_data;
     988        1891 :             occurred_events->events = WL_LATCH_SET;
     989        1891 :             occurred_events++;
     990        1891 :             returned_events++;
     991             : 
     992        1891 :             break;
     993             :         }
     994             : 
     995             :         /*
     996             :          * Wait for events using the readiness primitive chosen at the top of
     997             :          * this file. If -1 is returned, a timeout has occurred, if 0 we have
     998             :          * to retry, everything >= 1 is the number of returned events.
     999             :          */
    1000       21215 :         rc = WaitEventSetWaitBlock(set, cur_timeout,
    1001             :                                    occurred_events, nevents);
    1002             : 
    1003       21215 :         if (rc == -1)
    1004         908 :             break;              /* timeout occurred */
    1005             :         else
    1006       20307 :             returned_events = rc;
    1007             : 
    1008             :         /* If we're not done, update cur_timeout for next iteration */
    1009       20307 :         if (returned_events == 0 && timeout >= 0)
    1010             :         {
    1011        1321 :             INSTR_TIME_SET_CURRENT(cur_time);
    1012        1321 :             INSTR_TIME_SUBTRACT(cur_time, start_time);
    1013        1321 :             cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
    1014        1321 :             if (cur_timeout <= 0)
    1015           1 :                 break;
    1016             :         }
    1017             :     }
    1018             : #ifndef WIN32
    1019       20013 :     waiting = false;
    1020             : #endif
    1021             : 
    1022       20013 :     pgstat_report_wait_end();
    1023             : 
    1024       20013 :     return returned_events;
    1025             : }
    1026             : 
    1027             : 
    1028             : #if defined(WAIT_USE_EPOLL)
    1029             : 
    1030             : /*
    1031             :  * Wait using linux's epoll_wait(2).
    1032             :  *
     1033             :  * This is the preferable wait method, as several readiness notifications are
     1034             :  * delivered, without having to iterate through all of set->events. The
     1035             :  * returned epoll_event structs contain a pointer to our events, making association
    1036             :  * easy.
    1037             :  */
    1038             : static inline int
    1039       21215 : WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
    1040             :                       WaitEvent *occurred_events, int nevents)
    1041             : {
    1042       21215 :     int         returned_events = 0;
    1043             :     int         rc;
    1044             :     WaitEvent  *cur_event;
    1045             :     struct epoll_event *cur_epoll_event;
    1046             : 
    1047             :     /* Sleep */
    1048       21215 :     rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,
    1049             :                     nevents, cur_timeout);
    1050             : 
    1051             :     /* Check return code */
    1052       21215 :     if (rc < 0)
    1053             :     {
    1054             :         /* EINTR is okay, otherwise complain */
    1055        1550 :         if (errno != EINTR)
    1056             :         {
    1057           0 :             waiting = false;
    1058           0 :             ereport(ERROR,
    1059             :                     (errcode_for_socket_access(),
    1060             :                      errmsg("epoll_wait() failed: %m")));
    1061             :         }
    1062        1550 :         return 0;
    1063             :     }
    1064       19665 :     else if (rc == 0)
    1065             :     {
    1066             :         /* timeout exceeded */
    1067         908 :         return -1;
    1068             :     }
    1069             : 
    1070             :     /*
    1071             :      * At least one event occurred, iterate over the returned epoll events
    1072             :      * until they're either all processed, or we've returned all the events
    1073             :      * the caller desired.
    1074             :      */
    1075       56271 :     for (cur_epoll_event = set->epoll_ret_events;
    1076       56271 :          cur_epoll_event < (set->epoll_ret_events + rc) &&
    1077             :          returned_events < nevents;
    1078       18757 :          cur_epoll_event++)
    1079             :     {
    1080             :         /* epoll's data pointer is set to the associated WaitEvent */
    1081       18757 :         cur_event = (WaitEvent *) cur_epoll_event->data.ptr;
    1082             : 
    1083       18757 :         occurred_events->pos = cur_event->pos;
    1084       18757 :         occurred_events->user_data = cur_event->user_data;
    1085       18757 :         occurred_events->events = 0;
    1086             : 
    1087       20304 :         if (cur_event->events == WL_LATCH_SET &&
    1088        1547 :             cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
    1089             :         {
    1090             :             /* There's data in the self-pipe, clear it. */
    1091        1547 :             drainSelfPipe();
    1092             : 
    1093        3094 :             if (set->latch->is_set)
    1094             :             {
    1095           3 :                 occurred_events->fd = PGINVALID_SOCKET;
    1096           3 :                 occurred_events->events = WL_LATCH_SET;
    1097           3 :                 occurred_events++;
    1098           3 :                 returned_events++;
    1099             :             }
    1100             :         }
    1101       17210 :         else if (cur_event->events == WL_POSTMASTER_DEATH &&
    1102           0 :                  cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
    1103             :         {
    1104             :             /*
    1105             :              * We expect an EPOLLHUP when the remote end is closed, but
    1106             :              * because we don't expect the pipe to become readable or to have
    1107             :              * any errors either, treat those cases as postmaster death, too.
    1108             :              *
    1109             :              * Be paranoid about a spurious event signalling the postmaster as
    1110             :              * being dead.  There have been reports about that happening with
    1111             :              * older primitives (select(2) to be specific), and a spurious
    1112             :              * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
    1113             :              * cost much.
    1114             :              */
    1115           0 :             if (!PostmasterIsAlive())
    1116             :             {
    1117           0 :                 occurred_events->fd = PGINVALID_SOCKET;
    1118           0 :                 occurred_events->events = WL_POSTMASTER_DEATH;
    1119           0 :                 occurred_events++;
    1120           0 :                 returned_events++;
    1121             :             }
    1122             :         }
    1123       17210 :         else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
    1124             :         {
    1125       17210 :             Assert(cur_event->fd != PGINVALID_SOCKET);
    1126             : 
    1127       34420 :             if ((cur_event->events & WL_SOCKET_READABLE) &&
    1128       17210 :                 (cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)))
    1129             :             {
    1130             :                 /* data available in socket, or EOF */
    1131       17210 :                 occurred_events->events |= WL_SOCKET_READABLE;
    1132             :             }
    1133             : 
    1134       17210 :             if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
    1135           0 :                 (cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)))
    1136             :             {
    1137             :                 /* writable, or EOF */
    1138           0 :                 occurred_events->events |= WL_SOCKET_WRITEABLE;
    1139             :             }
    1140             : 
    1141       17210 :             if (occurred_events->events != 0)
    1142             :             {
    1143       17210 :                 occurred_events->fd = cur_event->fd;
    1144       17210 :                 occurred_events++;
    1145       17210 :                 returned_events++;
    1146             :             }
    1147             :         }
    1148             :     }
    1149             : 
    1150       18757 :     return returned_events;
    1151             : }
    1152             : 
    1153             : #elif defined(WAIT_USE_POLL)
    1154             : 
    1155             : /*
    1156             :  * Wait using poll(2).
    1157             :  *
    1158             :  * This allows to receive readiness notifications for several events at once,
    1159             :  * but requires iterating through all of set->pollfds.
    1160             :  */
    1161             : static inline int
    1162             : WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
    1163             :                       WaitEvent *occurred_events, int nevents)
    1164             : {
    1165             :     int         returned_events = 0;
    1166             :     int         rc;
    1167             :     WaitEvent  *cur_event;
    1168             :     struct pollfd *cur_pollfd;
    1169             : 
    1170             :     /* Sleep */
    1171             :     rc = poll(set->pollfds, set->nevents, (int) cur_timeout);
    1172             : 
    1173             :     /* Check return code */
    1174             :     if (rc < 0)
    1175             :     {
    1176             :         /* EINTR is okay, otherwise complain */
    1177             :         if (errno != EINTR)
    1178             :         {
    1179             :             waiting = false;
    1180             :             ereport(ERROR,
    1181             :                     (errcode_for_socket_access(),
    1182             :                      errmsg("poll() failed: %m")));
    1183             :         }
    1184             :         return 0;
    1185             :     }
    1186             :     else if (rc == 0)
    1187             :     {
    1188             :         /* timeout exceeded */
    1189             :         return -1;
    1190             :     }
    1191             : 
    1192             :     for (cur_event = set->events, cur_pollfd = set->pollfds;
    1193             :          cur_event < (set->events + set->nevents) &&
    1194             :          returned_events < nevents;
    1195             :          cur_event++, cur_pollfd++)
    1196             :     {
    1197             :         /* no activity on this FD, skip */
    1198             :         if (cur_pollfd->revents == 0)
    1199             :             continue;
    1200             : 
    1201             :         occurred_events->pos = cur_event->pos;
    1202             :         occurred_events->user_data = cur_event->user_data;
    1203             :         occurred_events->events = 0;
    1204             : 
    1205             :         if (cur_event->events == WL_LATCH_SET &&
    1206             :             (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
    1207             :         {
    1208             :             /* There's data in the self-pipe, clear it. */
    1209             :             drainSelfPipe();
    1210             : 
    1211             :             if (set->latch->is_set)
    1212             :             {
    1213             :                 occurred_events->fd = PGINVALID_SOCKET;
    1214             :                 occurred_events->events = WL_LATCH_SET;
    1215             :                 occurred_events++;
    1216             :                 returned_events++;
    1217             :             }
    1218             :         }
    1219             :         else if (cur_event->events == WL_POSTMASTER_DEATH &&
    1220             :                  (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
    1221             :         {
    1222             :             /*
    1223             :              * We expect an POLLHUP when the remote end is closed, but because
    1224             :              * we don't expect the pipe to become readable or to have any
    1225             :              * errors either, treat those cases as postmaster death, too.
    1226             :              *
    1227             :              * Be paranoid about a spurious event signalling the postmaster as
    1228             :              * being dead.  There have been reports about that happening with
    1229             :              * older primitives (select(2) to be specific), and a spurious
    1230             :              * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
    1231             :              * cost much.
    1232             :              */
    1233             :             if (!PostmasterIsAlive())
    1234             :             {
    1235             :                 occurred_events->fd = PGINVALID_SOCKET;
    1236             :                 occurred_events->events = WL_POSTMASTER_DEATH;
    1237             :                 occurred_events++;
    1238             :                 returned_events++;
    1239             :             }
    1240             :         }
    1241             :         else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
    1242             :         {
    1243             :             int         errflags = POLLHUP | POLLERR | POLLNVAL;
    1244             : 
    1245             :             Assert(cur_event->fd >= PGINVALID_SOCKET);
    1246             : 
    1247             :             if ((cur_event->events & WL_SOCKET_READABLE) &&
    1248             :                 (cur_pollfd->revents & (POLLIN | errflags)))
    1249             :             {
    1250             :                 /* data available in socket, or EOF */
    1251             :                 occurred_events->events |= WL_SOCKET_READABLE;
    1252             :             }
    1253             : 
    1254             :             if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
    1255             :                 (cur_pollfd->revents & (POLLOUT | errflags)))
    1256             :             {
    1257             :                 /* writeable, or EOF */
    1258             :                 occurred_events->events |= WL_SOCKET_WRITEABLE;
    1259             :             }
    1260             : 
    1261             :             if (occurred_events->events != 0)
    1262             :             {
    1263             :                 occurred_events->fd = cur_event->fd;
    1264             :                 occurred_events++;
    1265             :                 returned_events++;
    1266             :             }
    1267             :         }
    1268             :     }
    1269             :     return returned_events;
    1270             : }
    1271             : 
    1272             : #elif defined(WAIT_USE_WIN32)
    1273             : 
    1274             : /*
    1275             :  * Wait using Windows' WaitForMultipleObjects().
    1276             :  *
    1277             :  * Unfortunately this will only ever return a single readiness notification at
    1278             :  * a time.  Note that while the official documentation for
    1279             :  * WaitForMultipleObjects is ambiguous about multiple events being "consumed"
    1280             :  * with a single bWaitAll = FALSE call,
    1281             :  * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273 confirms
    1282             :  * that only one event is "consumed".
    1283             :  */
    1284             : static inline int
    1285             : WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
    1286             :                       WaitEvent *occurred_events, int nevents)
    1287             : {
    1288             :     int         returned_events = 0;
    1289             :     DWORD       rc;
    1290             :     WaitEvent  *cur_event;
    1291             : 
    1292             :     /* Reset any wait events that need it */
    1293             :     for (cur_event = set->events;
    1294             :          cur_event < (set->events + set->nevents);
    1295             :          cur_event++)
    1296             :     {
    1297             :         if (cur_event->reset)
    1298             :         {
    1299             :             WaitEventAdjustWin32(set, cur_event);
    1300             :             cur_event->reset = false;
    1301             :         }
    1302             : 
    1303             :         /*
    1304             :          * Windows does not guarantee to log an FD_WRITE network event
    1305             :          * indicating that more data can be sent unless the previous send()
    1306             :          * failed with WSAEWOULDBLOCK.  While our caller might well have made
    1307             :          * such a call, we cannot assume that here.  Therefore, if waiting for
    1308             :          * write-ready, force the issue by doing a dummy send().  If the dummy
    1309             :          * send() succeeds, assume that the socket is in fact write-ready, and
    1310             :          * return immediately.  Also, if it fails with something other than
    1311             :          * WSAEWOULDBLOCK, return a write-ready indication to let our caller
    1312             :          * deal with the error condition.
    1313             :          */
    1314             :         if (cur_event->events & WL_SOCKET_WRITEABLE)
    1315             :         {
    1316             :             char        c;
    1317             :             WSABUF      buf;
    1318             :             DWORD       sent;
    1319             :             int         r;
    1320             : 
    1321             :             buf.buf = &c;
    1322             :             buf.len = 0;
    1323             : 
    1324             :             r = WSASend(cur_event->fd, &buf, 1, &sent, 0, NULL, NULL);
    1325             :             if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
    1326             :             {
    1327             :                 occurred_events->pos = cur_event->pos;
    1328             :                 occurred_events->user_data = cur_event->user_data;
    1329             :                 occurred_events->events = WL_SOCKET_WRITEABLE;
    1330             :                 occurred_events->fd = cur_event->fd;
    1331             :                 return 1;
    1332             :             }
    1333             :         }
    1334             :     }
    1335             : 
    1336             :     /*
    1337             :      * Sleep.
    1338             :      *
    1339             :      * Need to wait for ->nevents + 1, because signal handle is in [0].
    1340             :      */
    1341             :     rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
    1342             :                                 cur_timeout);
    1343             : 
    1344             :     /* Check return code */
    1345             :     if (rc == WAIT_FAILED)
    1346             :         elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",
    1347             :              GetLastError());
    1348             :     else if (rc == WAIT_TIMEOUT)
    1349             :     {
    1350             :         /* timeout exceeded */
    1351             :         return -1;
    1352             :     }
    1353             : 
    1354             :     if (rc == WAIT_OBJECT_0)
    1355             :     {
    1356             :         /* Service newly-arrived signals */
    1357             :         pgwin32_dispatch_queued_signals();
    1358             :         return 0;               /* retry */
    1359             :     }
    1360             : 
    1361             :     /*
    1362             :      * With an offset of one, due to the always present pgwin32_signal_event,
    1363             :      * the handle offset directly corresponds to a wait event.
    1364             :      */
    1365             :     cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1];
    1366             : 
    1367             :     occurred_events->pos = cur_event->pos;
    1368             :     occurred_events->user_data = cur_event->user_data;
    1369             :     occurred_events->events = 0;
    1370             : 
    1371             :     if (cur_event->events == WL_LATCH_SET)
    1372             :     {
    1373             :         if (!ResetEvent(set->latch->event))
    1374             :             elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());
    1375             : 
    1376             :         if (set->latch->is_set)
    1377             :         {
    1378             :             occurred_events->fd = PGINVALID_SOCKET;
    1379             :             occurred_events->events = WL_LATCH_SET;
    1380             :             occurred_events++;
    1381             :             returned_events++;
    1382             :         }
    1383             :     }
    1384             :     else if (cur_event->events == WL_POSTMASTER_DEATH)
    1385             :     {
    1386             :         /*
    1387             :          * Postmaster apparently died.  Since the consequences of falsely
    1388             :          * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we take
    1389             :          * the trouble to positively verify this with PostmasterIsAlive(),
    1390             :          * even though there is no known reason to think that the event could
    1391             :          * be falsely set on Windows.
    1392             :          */
    1393             :         if (!PostmasterIsAlive())
    1394             :         {
    1395             :             occurred_events->fd = PGINVALID_SOCKET;
    1396             :             occurred_events->events = WL_POSTMASTER_DEATH;
    1397             :             occurred_events++;
    1398             :             returned_events++;
    1399             :         }
    1400             :     }
    1401             :     else if (cur_event->events & WL_SOCKET_MASK)
    1402             :     {
    1403             :         WSANETWORKEVENTS resEvents;
    1404             :         HANDLE      handle = set->handles[cur_event->pos + 1];
    1405             : 
    1406             :         Assert(cur_event->fd);
    1407             : 
    1408             :         occurred_events->fd = cur_event->fd;
    1409             : 
    1410             :         ZeroMemory(&resEvents, sizeof(resEvents));
    1411             :         if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0)
    1412             :             elog(ERROR, "failed to enumerate network events: error code %u",
    1413             :                  WSAGetLastError());
    1414             :         if ((cur_event->events & WL_SOCKET_READABLE) &&
    1415             :             (resEvents.lNetworkEvents & FD_READ))
    1416             :         {
    1417             :             /* data available in socket */
    1418             :             occurred_events->events |= WL_SOCKET_READABLE;
    1419             : 
    1420             :             /*------
    1421             :              * WaitForMultipleObjects doesn't guarantee that a read event will
    1422             :              * be returned if the latch is set at the same time.  Even if it
    1423             :              * did, the caller might drop that event expecting it to reoccur
    1424             :              * on next call.  So, we must force the event to be reset if this
    1425             :              * WaitEventSet is used again in order to avoid an indefinite
    1426             :              * hang.  Refer https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx
    1427             :              * for the behavior of socket events.
    1428             :              *------
    1429             :              */
    1430             :             cur_event->reset = true;
    1431             :         }
    1432             :         if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
    1433             :             (resEvents.lNetworkEvents & FD_WRITE))
    1434             :         {
    1435             :             /* writeable */
    1436             :             occurred_events->events |= WL_SOCKET_WRITEABLE;
    1437             :         }
    1438             :         if ((cur_event->events & WL_SOCKET_CONNECTED) &&
    1439             :             (resEvents.lNetworkEvents & FD_CONNECT))
    1440             :         {
    1441             :             /* connected */
    1442             :             occurred_events->events |= WL_SOCKET_CONNECTED;
    1443             :         }
    1444             :         if (resEvents.lNetworkEvents & FD_CLOSE)
    1445             :         {
    1446             :             /* EOF/error, so signal all caller-requested socket flags */
    1447             :             occurred_events->events |= (cur_event->events & WL_SOCKET_MASK);
    1448             :         }
    1449             : 
    1450             :         if (occurred_events->events != 0)
    1451             :         {
    1452             :             occurred_events++;
    1453             :             returned_events++;
    1454             :         }
    1455             :     }
    1456             : 
    1457             :     return returned_events;
    1458             : }
    1459             : #endif
    1460             : 
    1461             : /*
    1462             :  * SetLatch uses SIGUSR1 to wake up the process waiting on the latch.
    1463             :  *
    1464             :  * Wake up WaitLatch, if we're waiting.  (We might not be, since SIGUSR1 is
    1465             :  * overloaded for multiple purposes; or we might not have reached WaitLatch
    1466             :  * yet, in which case we don't need to fill the pipe either.)
    1467             :  *
    1468             :  * NB: when calling this in a signal handler, be sure to save and restore
    1469             :  * errno around it.
    1470             :  */
    1471             : #ifndef WIN32
    1472             : void
    1473        1767 : latch_sigusr1_handler(void)
    1474             : {
    1475        1767 :     if (waiting)
    1476        1583 :         sendSelfPipeByte();
    1477        1767 : }
    1478             : #endif                          /* !WIN32 */
    1479             : 
    1480             : /* Send one byte to the self-pipe, to wake up WaitLatch */
    1481             : #ifndef WIN32
    1482             : static void
    1483        2045 : sendSelfPipeByte(void)
    1484             : {
    1485             :     int         rc;
    1486        2045 :     char        dummy = 0;
    1487             : 
    1488             : retry:
    1489        2045 :     rc = write(selfpipe_writefd, &dummy, 1);
    1490        2045 :     if (rc < 0)
    1491             :     {
    1492             :         /* If interrupted by signal, just retry */
    1493           0 :         if (errno == EINTR)
    1494           0 :             goto retry;
    1495             : 
    1496             :         /*
    1497             :          * If the pipe is full, we don't need to retry, the data that's there
    1498             :          * already is enough to wake up WaitLatch.
    1499             :          */
    1500           0 :         if (errno == EAGAIN || errno == EWOULDBLOCK)
    1501           0 :             return;
    1502             : 
    1503             :         /*
    1504             :          * Oops, the write() failed for some other reason. We might be in a
    1505             :          * signal handler, so it's not safe to elog(). We have no choice but
    1506             :          * silently ignore the error.
    1507             :          */
    1508           0 :         return;
    1509             :     }
    1510             : }
    1511             : #endif                          /* !WIN32 */
    1512             : 
    1513             : /*
    1514             :  * Read all available data from the self-pipe
    1515             :  *
    1516             :  * Note: this is only called when waiting = true.  If it fails and doesn't
    1517             :  * return, it must reset that flag first (though ideally, this will never
    1518             :  * happen).
    1519             :  */
    1520             : #ifndef WIN32
    1521             : static void
    1522        1547 : drainSelfPipe(void)
    1523             : {
    1524             :     /*
    1525             :      * There shouldn't normally be more than one byte in the pipe, or maybe a
    1526             :      * few bytes if multiple processes run SetLatch at the same instant.
    1527             :      */
    1528             :     char        buf[16];
    1529             :     int         rc;
    1530             : 
    1531             :     for (;;)
    1532             :     {
    1533        1547 :         rc = read(selfpipe_readfd, buf, sizeof(buf));
    1534        1547 :         if (rc < 0)
    1535             :         {
    1536           0 :             if (errno == EAGAIN || errno == EWOULDBLOCK)
    1537             :                 break;          /* the pipe is empty */
    1538           0 :             else if (errno == EINTR)
    1539           0 :                 continue;       /* retry */
    1540             :             else
    1541             :             {
    1542           0 :                 waiting = false;
    1543           0 :                 elog(ERROR, "read() on self-pipe failed: %m");
    1544             :             }
    1545             :         }
    1546        1547 :         else if (rc == 0)
    1547             :         {
    1548           0 :             waiting = false;
    1549           0 :             elog(ERROR, "unexpected EOF on self-pipe");
    1550             :         }
    1551        1547 :         else if (rc < sizeof(buf))
    1552             :         {
    1553             :             /* we successfully drained the pipe; no need to read() again */
    1554        1547 :             break;
    1555             :         }
    1556             :         /* else buffer wasn't big enough, so read again */
    1557           0 :     }
    1558        1547 : }
    1559             : #endif                          /* !WIN32 */

Generated by: LCOV version 1.11