Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * latch.c
4 : * Routines for inter-process latches
5 : *
6 : * The Unix implementation uses the so-called self-pipe trick to overcome the
7 : * race condition involved with poll() (or epoll_wait() on Linux) and setting
8 : * a global flag in the signal handler. When a latch is set and the current
9 : * process is waiting for it, the signal handler wakes up the poll() in
10 : * WaitLatch by writing a byte to a pipe. A signal by itself doesn't interrupt
11 : * poll() on all platforms, and even on platforms where it does, a signal that
12 : * arrives just before the poll() call does not prevent poll() from entering
13 : * sleep. An incoming byte on a pipe however reliably interrupts the sleep,
14 : * and causes poll() to return immediately even if the signal arrives before
15 : * poll() begins.
16 : *
17 : * When SetLatch is called from the same process that owns the latch,
18 : * SetLatch writes the byte directly to the pipe. If it's owned by another
19 : * process, SIGUSR1 is sent and the signal handler in the waiting process
20 : * writes the byte to the pipe on behalf of the signaling process.
21 : *
22 : * The Windows implementation uses Windows events that are inherited by all
23 : * postmaster child processes. There's no need for the self-pipe trick there.
24 : *
25 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
26 : * Portions Copyright (c) 1994, Regents of the University of California
27 : *
28 : * IDENTIFICATION
29 : * src/backend/storage/ipc/latch.c
30 : *
31 : *-------------------------------------------------------------------------
32 : */
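
/*
 * Editor's note: the following is an illustrative sketch, not part of
 * latch.c.  It shows the bare self-pipe trick described above in a generic
 * POSIX setting (a standalone program would need <poll.h>, <unistd.h> and
 * <errno.h>); all names are hypothetical and it is kept under "#if 0".
 * The handler writes a byte into a non-blocking pipe whose read end is
 * watched by poll(), so a signal delivered at any moment, even just before
 * poll() is entered, reliably ends the sleep.
 */
#if 0
static int	sketch_pipe[2];		/* both ends made O_NONBLOCK after pipe() */

static void
sketch_signal_handler(int signo)
{
	int			save_errno = errno;
	char		b = 0;

	(void) write(sketch_pipe[1], &b, 1);	/* wakes up the poll() below */
	errno = save_errno;
}

static void
sketch_wait_for_signal(void)
{
	struct pollfd pfd;
	char		buf[16];

	pfd.fd = sketch_pipe[0];
	pfd.events = POLLIN;

	(void) poll(&pfd, 1, -1);	/* returns as soon as a byte is pending */

	/* drain the pipe; the read end is non-blocking, so this cannot hang */
	while (read(sketch_pipe[0], buf, sizeof(buf)) > 0)
		;
}
#endif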
33 : #include "postgres.h"
34 :
35 : #include <fcntl.h>
36 : #include <limits.h>
37 : #include <signal.h>
38 : #include <unistd.h>
39 : #ifdef HAVE_SYS_EPOLL_H
40 : #include <sys/epoll.h>
41 : #endif
42 : #ifdef HAVE_POLL_H
43 : #include <poll.h>
44 : #endif
45 :
46 : #include "miscadmin.h"
47 : #include "pgstat.h"
48 : #include "port/atomics.h"
49 : #include "portability/instr_time.h"
50 : #include "postmaster/postmaster.h"
51 : #include "storage/latch.h"
52 : #include "storage/pmsignal.h"
53 : #include "storage/shmem.h"
54 :
55 : /*
56 : * Select the fd readiness primitive to use. Normally the "most modern"
57 : * primitive supported by the OS will be used, but for testing it can be
58 : * useful to manually specify the primitive to use. If desired, just add a
59 : * define somewhere before this block.
60 : */
61 : #if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \
62 : defined(WAIT_USE_WIN32)
63 : /* don't overwrite manual choice */
64 : #elif defined(HAVE_SYS_EPOLL_H)
65 : #define WAIT_USE_EPOLL
66 : #elif defined(HAVE_POLL)
67 : #define WAIT_USE_POLL
68 : #elif WIN32
69 : #define WAIT_USE_WIN32
70 : #else
71 : #error "no wait set implementation available"
72 : #endif
73 :
74 : /* typedef in latch.h */
75 : struct WaitEventSet
76 : {
77 : int nevents; /* number of registered events */
78 : int nevents_space; /* maximum number of events in this set */
79 :
80 : /*
81 : * Array, of nevents_space length, storing the definition of events this
82 : * set is waiting for.
83 : */
84 : WaitEvent *events;
85 :
86 : /*
87 : * If WL_LATCH_SET is specified in any wait event, latch is a pointer to
88 : * said latch, and latch_pos the offset in the ->events array. This is
89 : * useful because we check the state of the latch before performing doing
90 : * syscalls related to waiting.
91 : */
92 : Latch *latch;
93 : int latch_pos;
94 :
95 : #if defined(WAIT_USE_EPOLL)
96 : int epoll_fd;
97 : /* epoll_wait returns events in a user-provided array, allocate once */
98 : struct epoll_event *epoll_ret_events;
99 : #elif defined(WAIT_USE_POLL)
100 : /* the events to wait for must be passed to every poll() call; prepare them once */
101 : struct pollfd *pollfds;
102 : #elif defined(WAIT_USE_WIN32)
103 :
104 : /*
105 : * Array of windows events. The first element always contains
106 : * pgwin32_signal_event, so the remaining elements are offset by one (i.e.
107 : * event->pos + 1).
108 : */
109 : HANDLE *handles;
110 : #endif
111 : };
112 :
113 : #ifndef WIN32
114 : /* Are we currently in WaitLatch? The signal handler would like to know. */
115 : static volatile sig_atomic_t waiting = false;
116 :
117 : /* Read and write ends of the self-pipe */
118 : static int selfpipe_readfd = -1;
119 : static int selfpipe_writefd = -1;
120 :
121 : /* Process owning the self-pipe --- needed for checking purposes */
122 : static int selfpipe_owner_pid = 0;
123 :
124 : /* Private function prototypes */
125 : static void sendSelfPipeByte(void);
126 : static void drainSelfPipe(void);
127 : #endif /* WIN32 */
128 :
129 : #if defined(WAIT_USE_EPOLL)
130 : static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action);
131 : #elif defined(WAIT_USE_POLL)
132 : static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event);
133 : #elif defined(WAIT_USE_WIN32)
134 : static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event);
135 : #endif
136 :
137 : static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
138 : WaitEvent *occurred_events, int nevents);
139 :
140 : /*
141 : * Initialize the process-local latch infrastructure.
142 : *
143 : * This must be called once during startup of any process that can wait on
144 : * latches, before it issues any InitLatch() or OwnLatch() calls.
145 : */
146 : void
147 345 : InitializeLatchSupport(void)
148 : {
149 : #ifndef WIN32
150 : int pipefd[2];
151 :
152 345 : if (IsUnderPostmaster)
153 : {
154 : /*
155 : * We might have inherited connections to a self-pipe created by the
156 : * postmaster. It's critical that child processes create their own
157 : * self-pipes, of course, and we really want them to close the
158 : * inherited FDs for safety's sake.
159 : */
160 341 : if (selfpipe_owner_pid != 0)
161 : {
162 : /* Assert we go through here but once in a child process */
163 0 : Assert(selfpipe_owner_pid != MyProcPid);
164 : /* Release postmaster's pipe FDs; ignore any error */
165 0 : (void) close(selfpipe_readfd);
166 0 : (void) close(selfpipe_writefd);
167 : /* Clean up, just for safety's sake; we'll set these below */
168 0 : selfpipe_readfd = selfpipe_writefd = -1;
169 0 : selfpipe_owner_pid = 0;
170 : }
171 : else
172 : {
173 : /*
174 : * Postmaster didn't create a self-pipe ... or else we're in an
175 : * EXEC_BACKEND build, in which case it doesn't matter since the
176 : * postmaster's pipe FDs were closed by the action of FD_CLOEXEC.
177 : */
178 341 : Assert(selfpipe_readfd == -1);
179 : }
180 : }
181 : else
182 : {
183 : /* In postmaster or standalone backend, assert we do this but once */
184 4 : Assert(selfpipe_readfd == -1);
185 4 : Assert(selfpipe_owner_pid == 0);
186 : }
187 :
188 : /*
189 : * Set up the self-pipe that allows a signal handler to wake up the
190 : * poll()/epoll_wait() in WaitLatch. Make the write-end non-blocking, so
191 : * that SetLatch won't block if the event has already been set many times
192 : * filling the kernel buffer. Make the read-end non-blocking too, so that
193 : * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK.
194 : * Also, make both FDs close-on-exec, since we surely do not want any
195 : * child processes messing with them.
196 : */
197 345 : if (pipe(pipefd) < 0)
198 0 : elog(FATAL, "pipe() failed: %m");
199 345 : if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1)
200 0 : elog(FATAL, "fcntl(F_SETFL) failed on read-end of self-pipe: %m");
201 345 : if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1)
202 0 : elog(FATAL, "fcntl(F_SETFL) failed on write-end of self-pipe: %m");
203 345 : if (fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) == -1)
204 0 : elog(FATAL, "fcntl(F_SETFD) failed on read-end of self-pipe: %m");
205 345 : if (fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) == -1)
206 0 : elog(FATAL, "fcntl(F_SETFD) failed on write-end of self-pipe: %m");
207 :
208 345 : selfpipe_readfd = pipefd[0];
209 345 : selfpipe_writefd = pipefd[1];
210 345 : selfpipe_owner_pid = MyProcPid;
211 : #else
212 : /* currently, nothing to do here for Windows */
213 : #endif
214 345 : }
215 :
216 : /*
217 : * Initialize a process-local latch.
218 : */
219 : void
220 345 : InitLatch(volatile Latch *latch)
221 : {
222 345 : latch->is_set = false;
223 345 : latch->owner_pid = MyProcPid;
224 345 : latch->is_shared = false;
225 :
226 : #ifndef WIN32
227 : /* Assert InitializeLatchSupport has been called in this process */
228 345 : Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
229 : #else
230 : latch->event = CreateEvent(NULL, TRUE, FALSE, NULL);
231 : if (latch->event == NULL)
232 : elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
233 : #endif /* WIN32 */
234 345 : }
235 :
236 : /*
237 : * Initialize a shared latch that can be set from other processes. The latch
238 : * is initially owned by no-one; use OwnLatch to associate it with the
239 : * current process.
240 : *
241 : * InitSharedLatch needs to be called in postmaster before forking child
242 : * processes, usually right after allocating the shared memory block
243 : * containing the latch with ShmemInitStruct. (The Unix implementation
244 : * doesn't actually require that, but the Windows one does.) Because of
245 : * this restriction, we have no concurrency issues to worry about here.
246 : *
247 : * Note that other handles created in this module are never marked as
248 : * inheritable. Thus we do not need to worry about cleaning up child
249 : * process references to postmaster-private latches or WaitEventSets.
250 : */
251 : void
252 585 : InitSharedLatch(volatile Latch *latch)
253 : {
254 : #ifdef WIN32
255 : SECURITY_ATTRIBUTES sa;
256 :
257 : /*
258 : * Set up security attributes to specify that the events are inherited.
259 : */
260 : ZeroMemory(&sa, sizeof(sa));
261 : sa.nLength = sizeof(sa);
262 : sa.bInheritHandle = TRUE;
263 :
264 : latch->event = CreateEvent(&sa, TRUE, FALSE, NULL);
265 : if (latch->event == NULL)
266 : elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
267 : #endif
268 :
269 585 : latch->is_set = false;
270 585 : latch->owner_pid = 0;
271 585 : latch->is_shared = true;
272 585 : }
273 :
274 : /*
275 : * Associate a shared latch with the current process, allowing it to
276 : * wait on the latch.
277 : *
278 : * Although there is a sanity check for latch-already-owned, we don't do
279 : * any sort of locking here, meaning that we could fail to detect the error
280 : * if two processes try to own the same latch at about the same time. If
281 : * there is any risk of that, caller must provide an interlock to prevent it.
282 : *
283 : * In any process that calls OwnLatch(), make sure that
284 : * latch_sigusr1_handler() is called from the SIGUSR1 signal handler,
285 : * as shared latches use SIGUSR1 for inter-process communication.
286 : */
287 : void
288 342 : OwnLatch(volatile Latch *latch)
289 : {
290 : /* Sanity checks */
291 342 : Assert(latch->is_shared);
292 :
293 : #ifndef WIN32
294 : /* Assert InitializeLatchSupport has been called in this process */
295 342 : Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
296 : #endif
297 :
298 342 : if (latch->owner_pid != 0)
299 0 : elog(ERROR, "latch already owned");
300 :
301 342 : latch->owner_pid = MyProcPid;
302 342 : }
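
/*
 * Editor's note: an illustrative sketch, not part of latch.c, of the
 * requirement stated above: the SIGUSR1 handler of any process that calls
 * OwnLatch() must invoke latch_sigusr1_handler(), or SetLatch() calls from
 * other processes will not wake it up.  The handler name is hypothetical;
 * kept under "#if 0".
 */
#if 0
static void
sketch_sigusr1_handler(SIGNAL_ARGS)
{
	int			save_errno = errno;

	latch_sigusr1_handler();	/* forward the wakeup to the self-pipe */

	errno = save_errno;
}
#endif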
303 :
304 : /*
305 : * Disown a shared latch currently owned by the current process.
306 : */
307 : void
308 342 : DisownLatch(volatile Latch *latch)
309 : {
310 342 : Assert(latch->is_shared);
311 342 : Assert(latch->owner_pid == MyProcPid);
312 :
313 342 : latch->owner_pid = 0;
314 342 : }
315 :
316 : /*
317 : * Wait for a given latch to be set, or for postmaster death, or until timeout
318 : * is exceeded. 'wakeEvents' is a bitmask that specifies which of those events
319 : * to wait for. If the latch is already set (and WL_LATCH_SET is given), the
320 : * function returns immediately.
321 : *
322 : * The "timeout" is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag
323 : * is given. Although it is declared as "long", we don't actually support
324 : * timeouts longer than INT_MAX milliseconds. Note that some extra overhead
325 : * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible.
326 : *
327 : * The latch must be owned by the current process, ie. it must be a
328 : * process-local latch initialized with InitLatch, or a shared latch
329 : * associated with the current process by calling OwnLatch.
330 : *
331 : * Returns bit mask indicating which condition(s) caused the wake-up. Note
332 : * that if multiple wake-up conditions are true, there is no guarantee that
333 : * we return all of them in one call, but we will return at least one.
334 : */
335 : int
336 2524 : WaitLatch(volatile Latch *latch, int wakeEvents, long timeout,
337 : uint32 wait_event_info)
338 : {
339 2524 : return WaitLatchOrSocket(latch, wakeEvents, PGINVALID_SOCKET, timeout,
340 : wait_event_info);
341 : }
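
/*
 * Editor's note: a sketch, not part of latch.c, of the standard coding
 * convention for WaitLatch() (see latch.h and the race discussion in
 * SetLatch() below): reset the latch first, then check for work, and wait
 * at the bottom of the loop, so a SetLatch() arriving at any point is
 * never lost.  "got_work", the timeout and PG_WAIT_EXTENSION are merely
 * plausible choices, and proc_exit() would need storage/ipc.h; kept under
 * "#if 0".
 */
#if 0
static volatile sig_atomic_t got_work = false;	/* set by a signal handler
												 * that also calls
												 * SetLatch(MyLatch) */

static void
sketch_main_loop(void)
{
	for (;;)
	{
		int			rc;

		ResetLatch(MyLatch);	/* reset BEFORE checking the work flag */

		if (got_work)
		{
			got_work = false;
			/* ... do the work ... */
		}

		rc = WaitLatch(MyLatch,
					   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
					   10 * 1000L,	/* wake up at least every 10 seconds */
					   PG_WAIT_EXTENSION);

		if (rc & WL_POSTMASTER_DEATH)
			proc_exit(1);

		CHECK_FOR_INTERRUPTS();
	}
}
#endif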
342 :
343 : /*
344 : * Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
345 : * conditions.
346 : *
347 : * When waiting on a socket, EOF and error conditions always cause the socket
348 : * to be reported as readable/writable/connected, so that the caller can deal
349 : * with the condition.
350 : *
351 : * NB: These days this is just a wrapper around the WaitEventSet API. When
352 : * using a latch very frequently, consider creating a longer-lived
353 : * WaitEventSet instead; that's more efficient.
354 : */
355 : int
356 4734 : WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
357 : long timeout, uint32 wait_event_info)
358 : {
359 4734 : int ret = 0;
360 : int rc;
361 : WaitEvent event;
362 4734 : WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 3);
363 :
364 4734 : if (wakeEvents & WL_TIMEOUT)
365 1578 : Assert(timeout >= 0);
366 : else
367 3156 : timeout = -1;
368 :
369 4734 : if (wakeEvents & WL_LATCH_SET)
370 4734 : AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
371 : (Latch *) latch, NULL);
372 :
373 4734 : if (wakeEvents & WL_POSTMASTER_DEATH && IsUnderPostmaster)
374 3858 : AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
375 : NULL, NULL);
376 :
377 4734 : if (wakeEvents & WL_SOCKET_MASK)
378 : {
379 : int ev;
380 :
381 2210 : ev = wakeEvents & WL_SOCKET_MASK;
382 2210 : AddWaitEventToSet(set, ev, sock, NULL, NULL);
383 : }
384 :
385 4734 : rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
386 :
387 4734 : if (rc == 0)
388 909 : ret |= WL_TIMEOUT;
389 : else
390 : {
391 3825 : ret |= event.events & (WL_LATCH_SET |
392 : WL_POSTMASTER_DEATH |
393 : WL_SOCKET_MASK);
394 : }
395 :
396 4734 : FreeWaitEventSet(set);
397 :
398 4734 : return ret;
399 : }
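
/*
 * Editor's note: a sketch, not part of latch.c, of the "longer living
 * WaitEventSet" recommended in the NB above.  It mirrors what
 * WaitLatchOrSocket() does internally, but builds the set only once
 * instead of on every wait.  The socket argument, the memory context
 * choice and PG_WAIT_EXTENSION are illustrative only; kept under "#if 0".
 */
#if 0
static void
sketch_long_lived_wait_set(pgsocket sock)
{
	WaitEventSet *set;
	WaitEvent	event;

	/* pay the setup cost (including the epoll fd, if used) exactly once */
	set = CreateWaitEventSet(CurrentMemoryContext, 3);
	AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
	AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET, NULL, NULL);
	AddWaitEventToSet(set, WL_SOCKET_READABLE, sock, NULL, NULL);

	for (;;)
	{
		int			nready;

		nready = WaitEventSetWait(set, -1 /* block indefinitely */ ,
								  &event, 1, PG_WAIT_EXTENSION);

		if (nready <= 0)
			continue;
		if (event.events & WL_POSTMASTER_DEATH)
			break;
		if (event.events & WL_LATCH_SET)
			ResetLatch(MyLatch);
		if (event.events & WL_SOCKET_READABLE)
		{
			/* ... read from sock ... */
		}
	}

	FreeWaitEventSet(set);
}
#endif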
400 :
401 : /*
402 : * Sets a latch and wakes up anyone waiting on it.
403 : *
404 : * This is cheap if the latch is already set, otherwise not so much.
405 : *
406 : * NB: when calling this in a signal handler, be sure to save and restore
407 : * errno around it. (That's standard practice in most signal handlers, of
408 : * course, but we used to omit it in handlers that only set a flag.)
409 : *
410 : * NB: this function is called from critical sections and signal handlers so
411 : * throwing an error is not a good idea.
412 : */
413 : void
414 4125 : SetLatch(volatile Latch *latch)
415 : {
416 : #ifndef WIN32
417 : pid_t owner_pid;
418 : #else
419 : HANDLE handle;
420 : #endif
421 :
422 : /*
423 : * The memory barrier has to be placed here to ensure that any flag
424 : * variables possibly changed by this process have been flushed to main
425 : * memory, before we check/set is_set.
426 : */
427 4125 : pg_memory_barrier();
428 :
429 : /* Quick exit if already set */
430 4125 : if (latch->is_set)
431 1875 : return;
432 :
433 2250 : latch->is_set = true;
434 :
435 : #ifndef WIN32
436 :
437 : /*
438 : * See if anyone's waiting for the latch. It can be the current process if
439 : * we're in a signal handler. We use the self-pipe to wake up the
440 : * poll()/epoll_wait() in that case. If it's another process, send a
441 : * signal.
442 : *
443 : * Fetch owner_pid only once, in case the latch is concurrently getting
444 : * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't
445 : * guaranteed to be true! In practice, the effective range of pid_t fits
446 : * in a 32 bit integer, and so should be atomic. In the worst case, we
447 : * might end up signaling the wrong process. Even then, you're very
448 : * unlucky if a process with that bogus pid exists and belongs to
449 : * Postgres; and PG database processes should handle excess SIGUSR1
450 : * interrupts without a problem anyhow.
451 : *
452 : * Another sort of race condition that's possible here is for a new
453 : * process to own the latch immediately after we look, so we don't signal
454 : * it. This is okay so long as all callers of ResetLatch/WaitLatch follow
455 : * the standard coding convention of waiting at the bottom of their loops,
456 : * not the top, so that they'll correctly process latch-setting events
457 : * that happen before they enter the loop.
458 : */
459 2250 : owner_pid = latch->owner_pid;
460 2250 : if (owner_pid == 0)
461 0 : return;
462 2250 : else if (owner_pid == MyProcPid)
463 : {
464 1092 : if (waiting)
465 462 : sendSelfPipeByte();
466 : }
467 : else
468 1158 : kill(owner_pid, SIGUSR1);
469 : #else
470 :
471 : /*
472 : * See if anyone's waiting for the latch. It can be the current process if
473 : * we're in a signal handler.
474 : *
475 : * Use a local variable here just in case somebody changes the event field
476 : * concurrently (which really should not happen).
477 : */
478 : handle = latch->event;
479 : if (handle)
480 : {
481 : SetEvent(handle);
482 :
483 : /*
484 : * Note that we silently ignore any errors. We might be in a signal
485 : * handler or other critical path where it's not safe to call elog().
486 : */
487 : }
488 : #endif
489 :
490 : }
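
/*
 * Editor's note: an illustrative sketch, not part of latch.c, of the NB
 * above about calling SetLatch() from a signal handler: save and restore
 * errno, and set any shared flag variables before setting the latch so
 * that the awakened process is guaranteed to see them (the memory barrier
 * above and the one in ResetLatch() make that safe).  The flag and handler
 * names are hypothetical; kept under "#if 0".
 */
#if 0
static volatile sig_atomic_t got_sighup = false;

static void
sketch_sighup_handler(SIGNAL_ARGS)
{
	int			save_errno = errno;

	got_sighup = true;			/* set the flag first ... */
	SetLatch(MyLatch);			/* ... then wake up the main loop */

	errno = save_errno;
}
#endif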
491 :
492 : /*
493 : * Clear the latch. Calling WaitLatch after this will sleep, unless
494 : * the latch is set again before the WaitLatch call.
495 : */
496 : void
497 5016 : ResetLatch(volatile Latch *latch)
498 : {
499 : /* Only the owner should reset the latch */
500 5016 : Assert(latch->owner_pid == MyProcPid);
501 :
502 5016 : latch->is_set = false;
503 :
504 : /*
505 : * Ensure that the write to is_set gets flushed to main memory before we
506 : * examine any flag variables. Otherwise a concurrent SetLatch might
507 : * falsely conclude that it needn't signal us, even though we have missed
508 : * seeing some flag updates that SetLatch was supposed to inform us of.
509 : */
510 5016 : pg_memory_barrier();
511 5016 : }
512 :
513 : /*
514 : * Create a WaitEventSet with space for nevents different events to wait for.
515 : *
516 : * These events can then be efficiently waited upon together, using
517 : * WaitEventSetWait().
518 : */
519 : WaitEventSet *
520 4949 : CreateWaitEventSet(MemoryContext context, int nevents)
521 : {
522 : WaitEventSet *set;
523 : char *data;
524 4949 : Size sz = 0;
525 :
526 : /*
527 : * Use MAXALIGN size/alignment to guarantee that later uses of memory are
528 : * aligned correctly. E.g. epoll_event might need 8 byte alignment on some
529 : * platforms, but earlier allocations like WaitEventSet and WaitEvent
530 : * might not be sized to guarantee that when purely using sizeof().
531 : */
532 4949 : sz += MAXALIGN(sizeof(WaitEventSet));
533 4949 : sz += MAXALIGN(sizeof(WaitEvent) * nevents);
534 :
535 : #if defined(WAIT_USE_EPOLL)
536 4949 : sz += MAXALIGN(sizeof(struct epoll_event) * nevents);
537 : #elif defined(WAIT_USE_POLL)
538 : sz += MAXALIGN(sizeof(struct pollfd) * nevents);
539 : #elif defined(WAIT_USE_WIN32)
540 : /* need space for the pgwin32_signal_event */
541 : sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1));
542 : #endif
543 :
544 4949 : data = (char *) MemoryContextAllocZero(context, sz);
545 :
546 4949 : set = (WaitEventSet *) data;
547 4949 : data += MAXALIGN(sizeof(WaitEventSet));
548 :
549 4949 : set->events = (WaitEvent *) data;
550 4949 : data += MAXALIGN(sizeof(WaitEvent) * nevents);
551 :
552 : #if defined(WAIT_USE_EPOLL)
553 4949 : set->epoll_ret_events = (struct epoll_event *) data;
554 4949 : data += MAXALIGN(sizeof(struct epoll_event) * nevents);
555 : #elif defined(WAIT_USE_POLL)
556 : set->pollfds = (struct pollfd *) data;
557 : data += MAXALIGN(sizeof(struct pollfd) * nevents);
558 : #elif defined(WAIT_USE_WIN32)
559 : set->handles = (HANDLE *) data;
560 : data += MAXALIGN(sizeof(HANDLE) * nevents);
561 : #endif
562 :
563 4949 : set->latch = NULL;
564 4949 : set->nevents_space = nevents;
565 :
566 : #if defined(WAIT_USE_EPOLL)
567 : #ifdef EPOLL_CLOEXEC
568 4949 : set->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
569 4949 : if (set->epoll_fd < 0)
570 0 : elog(ERROR, "epoll_create1 failed: %m");
571 : #else
572 : /* cope with ancient glibc lacking epoll_create1 (e.g., RHEL5) */
573 : set->epoll_fd = epoll_create(nevents);
574 : if (set->epoll_fd < 0)
575 : elog(ERROR, "epoll_create failed: %m");
576 : if (fcntl(set->epoll_fd, F_SETFD, FD_CLOEXEC) == -1)
577 : elog(ERROR, "fcntl(F_SETFD) failed on epoll descriptor: %m");
578 : #endif /* EPOLL_CLOEXEC */
579 : #elif defined(WAIT_USE_WIN32)
580 :
581 : /*
582 : * To handle signals while waiting, we need to add a win32 specific event.
583 : * We accounted for the additional event at the top of this routine. See
584 : * port/win32/signal.c for more details.
585 : *
586 : * Note: pgwin32_signal_event should be first to ensure that it will be
587 : * reported when multiple events are set. We want to guarantee that
588 : * pending signals are serviced.
589 : */
590 : set->handles[0] = pgwin32_signal_event;
591 : StaticAssertStmt(WSA_INVALID_EVENT == NULL, "");
592 : #endif
593 :
594 4949 : return set;
595 : }
596 :
597 : /*
598 : * Free a previously created WaitEventSet.
599 : *
600 : * Note: preferably, this shouldn't have to free any resources that could be
601 : * inherited across an exec(). If it did, we'd likely leak those resources in
602 : * many scenarios. For the epoll case, we ensure that by setting FD_CLOEXEC
603 : * when the FD is created. For the Windows case, we assume that the handles
604 : * involved are non-inheritable.
605 : */
606 : void
607 4734 : FreeWaitEventSet(WaitEventSet *set)
608 : {
609 : #if defined(WAIT_USE_EPOLL)
610 4734 : close(set->epoll_fd);
611 : #elif defined(WAIT_USE_WIN32)
612 : WaitEvent *cur_event;
613 :
614 : for (cur_event = set->events;
615 : cur_event < (set->events + set->nevents);
616 : cur_event++)
617 : {
618 : if (cur_event->events & WL_LATCH_SET)
619 : {
620 : /* uses the latch's HANDLE */
621 : }
622 : else if (cur_event->events & WL_POSTMASTER_DEATH)
623 : {
624 : /* uses PostmasterHandle */
625 : }
626 : else
627 : {
628 : /* Clean up the event object we created for the socket */
629 : WSAEventSelect(cur_event->fd, NULL, 0);
630 : WSACloseEvent(set->handles[cur_event->pos + 1]);
631 : }
632 : }
633 : #endif
634 :
635 4734 : pfree(set);
636 4734 : }
637 :
638 : /* ---
639 : * Add an event to the set. Possible events are:
640 : * - WL_LATCH_SET: Wait for the latch to be set
641 : * - WL_POSTMASTER_DEATH: Wait for postmaster to die
642 : * - WL_SOCKET_READABLE: Wait for socket to become readable,
643 : * can be combined in one event with other WL_SOCKET_* events
644 : * - WL_SOCKET_WRITEABLE: Wait for socket to become writeable,
645 : * can be combined with other WL_SOCKET_* events
646 : * - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
647 : * can be combined with other WL_SOCKET_* events (on non-Windows
648 : * platforms, this is the same as WL_SOCKET_WRITEABLE)
649 : *
650 : * Returns the offset in WaitEventSet->events (starting from 0), which can be
651 : * used to modify previously added wait events using ModifyWaitEvent().
652 : *
653 : * In the WL_LATCH_SET case the latch must be owned by the current process,
654 : * i.e. it must be a process-local latch initialized with InitLatch, or a
655 : * shared latch associated with the current process by calling OwnLatch.
656 : *
657 : * In the WL_SOCKET_READABLE/WRITEABLE/CONNECTED cases, EOF and error
658 : * conditions cause the socket to be reported as readable/writable/connected,
659 : * so that the caller can deal with the condition.
660 : *
661 : * The user_data pointer specified here will be set for the events returned
662 : * by WaitEventSetWait(), making it easy to associate additional data with
663 : * events.
664 : */
665 : int
666 11447 : AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
667 : void *user_data)
668 : {
669 : WaitEvent *event;
670 :
671 : /* not enough space */
672 11447 : Assert(set->nevents < set->nevents_space);
673 :
674 11447 : if (latch)
675 : {
676 4949 : if (latch->owner_pid != MyProcPid)
677 0 : elog(ERROR, "cannot wait on a latch owned by another process");
678 4949 : if (set->latch)
679 0 : elog(ERROR, "cannot wait on more than one latch");
680 4949 : if ((events & WL_LATCH_SET) != WL_LATCH_SET)
681 0 : elog(ERROR, "latch events only support being set");
682 : }
683 : else
684 : {
685 6498 : if (events & WL_LATCH_SET)
686 0 : elog(ERROR, "cannot wait on latch without a specified latch");
687 : }
688 :
689 : /* waiting for socket readiness without a socket indicates a bug */
690 11447 : if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK))
691 0 : elog(ERROR, "cannot wait on socket event without a socket");
692 :
693 11447 : event = &set->events[set->nevents];
694 11447 : event->pos = set->nevents++;
695 11447 : event->fd = fd;
696 11447 : event->events = events;
697 11447 : event->user_data = user_data;
698 : #ifdef WIN32
699 : event->reset = false;
700 : #endif
701 :
702 11447 : if (events == WL_LATCH_SET)
703 : {
704 4949 : set->latch = latch;
705 4949 : set->latch_pos = event->pos;
706 : #ifndef WIN32
707 4949 : event->fd = selfpipe_readfd;
708 : #endif
709 : }
710 6498 : else if (events == WL_POSTMASTER_DEATH)
711 : {
712 : #ifndef WIN32
713 4073 : event->fd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
714 : #endif
715 : }
716 :
717 : /* perform wait primitive specific initialization, if needed */
718 : #if defined(WAIT_USE_EPOLL)
719 11447 : WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
720 : #elif defined(WAIT_USE_POLL)
721 : WaitEventAdjustPoll(set, event);
722 : #elif defined(WAIT_USE_WIN32)
723 : WaitEventAdjustWin32(set, event);
724 : #endif
725 :
726 11447 : return event->pos;
727 : }
728 :
729 : /*
730 : * Change the event mask and, in the WL_LATCH_SET case, the latch associated
731 : * with the WaitEvent.
732 : *
733 : * 'pos' is the id returned by AddWaitEventToSet.
734 : */
735 : void
736 15709 : ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
737 : {
738 : WaitEvent *event;
739 :
740 15709 : Assert(pos < set->nevents);
741 :
742 15709 : event = &set->events[pos];
743 :
744 : /*
745 : * If neither the event mask nor the associated latch changes, return
746 : * early. That's an important optimization for some sockets, where
747 : * ModifyWaitEvent is frequently used to switch from waiting for reads to
748 : * waiting on writes.
749 : */
750 31207 : if (events == event->events &&
751 15928 : (!(event->events & WL_LATCH_SET) || set->latch == latch))
752 30777 : return;
753 :
754 1071 : if (event->events & WL_LATCH_SET &&
755 430 : events != event->events)
756 : {
757 : /* we could allow to disable latch events for a while */
758 0 : elog(ERROR, "cannot modify latch event");
759 : }
760 :
761 641 : if (event->events & WL_POSTMASTER_DEATH)
762 : {
763 0 : elog(ERROR, "cannot modify postmaster death event");
764 : }
765 :
766 : /* FIXME: validate event mask */
767 641 : event->events = events;
768 :
769 641 : if (events == WL_LATCH_SET)
770 : {
771 430 : set->latch = latch;
772 : }
773 :
774 : #if defined(WAIT_USE_EPOLL)
775 641 : WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
776 : #elif defined(WAIT_USE_POLL)
777 : WaitEventAdjustPoll(set, event);
778 : #elif defined(WAIT_USE_WIN32)
779 : WaitEventAdjustWin32(set, event);
780 : #endif
781 : }
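
/*
 * Editor's note: a sketch, not part of latch.c, of the common use of the
 * position returned by AddWaitEventToSet(), as mentioned above: switching
 * a socket between waiting for reads and waiting for writes.  When the new
 * mask equals the current one, ModifyWaitEvent() returns without issuing
 * any syscall.  Names are hypothetical; kept under "#if 0".
 */
#if 0
static void
sketch_toggle_wait_direction(WaitEventSet *set, pgsocket sock)
{
	int			sock_pos;

	sock_pos = AddWaitEventToSet(set, WL_SOCKET_READABLE, sock, NULL, NULL);

	/* ... later, once there is data queued to send ... */
	ModifyWaitEvent(set, sock_pos, WL_SOCKET_WRITEABLE, NULL);

	/* ... and back again once the send buffer has drained ... */
	ModifyWaitEvent(set, sock_pos, WL_SOCKET_READABLE, NULL);
}
#endif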
782 :
783 : #if defined(WAIT_USE_EPOLL)
784 : /*
785 : * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
786 : */
787 : static void
788 12088 : WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
789 : {
790 : struct epoll_event epoll_ev;
791 : int rc;
792 :
793 : /* pointer to our event, returned by epoll_wait */
794 12088 : epoll_ev.data.ptr = event;
795 : /* always wait for errors */
796 12088 : epoll_ev.events = EPOLLERR | EPOLLHUP;
797 :
798 : /* prepare pollfd entry once */
799 12088 : if (event->events == WL_LATCH_SET)
800 : {
801 5379 : Assert(set->latch != NULL);
802 5379 : epoll_ev.events |= EPOLLIN;
803 : }
804 6709 : else if (event->events == WL_POSTMASTER_DEATH)
805 : {
806 4073 : epoll_ev.events |= EPOLLIN;
807 : }
808 : else
809 : {
810 2636 : Assert(event->fd != PGINVALID_SOCKET);
811 2636 : Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
812 :
813 2636 : if (event->events & WL_SOCKET_READABLE)
814 2421 : epoll_ev.events |= EPOLLIN;
815 2636 : if (event->events & WL_SOCKET_WRITEABLE)
816 215 : epoll_ev.events |= EPOLLOUT;
817 : }
818 :
819 : /*
820 : * Even though unused, we also pass epoll_ev as the data argument if
821 : * EPOLL_CTL_DEL is passed as action. There used to be an epoll bug
822 : * requiring that, and actually it makes the code simpler...
823 : */
824 12088 : rc = epoll_ctl(set->epoll_fd, action, event->fd, &epoll_ev);
825 :
826 12088 : if (rc < 0)
827 0 : ereport(ERROR,
828 : (errcode_for_socket_access(),
829 : errmsg("epoll_ctl() failed: %m")));
830 12088 : }
831 : #endif
832 :
833 : #if defined(WAIT_USE_POLL)
834 : static void
835 : WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
836 : {
837 : struct pollfd *pollfd = &set->pollfds[event->pos];
838 :
839 : pollfd->revents = 0;
840 : pollfd->fd = event->fd;
841 :
842 : /* prepare pollfd entry once */
843 : if (event->events == WL_LATCH_SET)
844 : {
845 : Assert(set->latch != NULL);
846 : pollfd->events = POLLIN;
847 : }
848 : else if (event->events == WL_POSTMASTER_DEATH)
849 : {
850 : pollfd->events = POLLIN;
851 : }
852 : else
853 : {
854 : Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
855 : pollfd->events = 0;
856 : if (event->events & WL_SOCKET_READABLE)
857 : pollfd->events |= POLLIN;
858 : if (event->events & WL_SOCKET_WRITEABLE)
859 : pollfd->events |= POLLOUT;
860 : }
861 :
862 : Assert(event->fd != PGINVALID_SOCKET);
863 : }
864 : #endif
865 :
866 : #if defined(WAIT_USE_WIN32)
867 : static void
868 : WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
869 : {
870 : HANDLE *handle = &set->handles[event->pos + 1];
871 :
872 : if (event->events == WL_LATCH_SET)
873 : {
874 : Assert(set->latch != NULL);
875 : *handle = set->latch->event;
876 : }
877 : else if (event->events == WL_POSTMASTER_DEATH)
878 : {
879 : *handle = PostmasterHandle;
880 : }
881 : else
882 : {
883 : int flags = FD_CLOSE; /* always check for errors/EOF */
884 :
885 : if (event->events & WL_SOCKET_READABLE)
886 : flags |= FD_READ;
887 : if (event->events & WL_SOCKET_WRITEABLE)
888 : flags |= FD_WRITE;
889 : if (event->events & WL_SOCKET_CONNECTED)
890 : flags |= FD_CONNECT;
891 :
892 : if (*handle == WSA_INVALID_EVENT)
893 : {
894 : *handle = WSACreateEvent();
895 : if (*handle == WSA_INVALID_EVENT)
896 : elog(ERROR, "failed to create event for socket: error code %u",
897 : WSAGetLastError());
898 : }
899 : if (WSAEventSelect(event->fd, *handle, flags) != 0)
900 : elog(ERROR, "failed to set up event for socket: error code %u",
901 : WSAGetLastError());
902 :
903 : Assert(event->fd != PGINVALID_SOCKET);
904 : }
905 : }
906 : #endif
907 :
908 : /*
909 : * Wait for events added to the set to happen, or until the timeout is
910 : * reached. At most nevents of the events that occurred are returned.
911 : *
912 : * If timeout = -1, block until an event occurs; if 0, check sockets for
913 : * readiness, but don't block; if > 0, block for at most timeout milliseconds.
914 : *
915 : * Returns the number of events that occurred, or 0 if the timeout was reached.
916 : *
917 : * Returned events will have the fd, pos, user_data fields set to the
918 : * values associated with the registered event.
919 : */
920 : int
921 20013 : WaitEventSetWait(WaitEventSet *set, long timeout,
922 : WaitEvent *occurred_events, int nevents,
923 : uint32 wait_event_info)
924 : {
925 20013 : int returned_events = 0;
926 : instr_time start_time;
927 : instr_time cur_time;
928 20013 : long cur_timeout = -1;
929 :
930 20013 : Assert(nevents > 0);
931 :
932 : /*
933 : * Initialize timeout if requested. We must record the current time so
934 : * that we can determine the remaining timeout if interrupted.
935 : */
936 20013 : if (timeout >= 0)
937 : {
938 1578 : INSTR_TIME_SET_CURRENT(start_time);
939 1578 : Assert(timeout >= 0 && timeout <= INT_MAX);
940 1578 : cur_timeout = timeout;
941 : }
942 :
943 20013 : pgstat_report_wait_start(wait_event_info);
944 :
945 : #ifndef WIN32
946 20013 : waiting = true;
947 : #else
948 : /* Ensure that signals are serviced even if latch is already set */
949 : pgwin32_dispatch_queued_signals();
950 : #endif
951 60332 : while (returned_events == 0)
952 : {
953 : int rc;
954 :
955 : /*
956 : * Check if the latch is set already. If so, leave the loop
957 : * immediately to avoid blocking again. We don't attempt to report any
958 : * other events that might also be satisfied.
959 : *
960 : * If someone sets the latch between this and the
961 : * WaitEventSetWaitBlock() below, the setter will write a byte to the
962 : * pipe (or signal us and the signal handler will do that), and the
963 : * readiness routine will return immediately.
964 : *
965 : * On Unix, if there's a pending byte in the self-pipe, we'll notice
966 : * it whenever we block. Only clearing the pipe in that case avoids
967 : * having to drain it every time WaitLatchOrSocket() is used. Should
968 : * the pipe-buffer fill up we're still ok, because the pipe is in
969 : * nonblocking mode. It's unlikely for that to happen, because the
970 : * self pipe isn't filled unless we're blocking (waiting = true), or
971 : * from inside a signal handler in latch_sigusr1_handler().
972 : *
973 : * On windows, we'll also notice if there's a pending event for the
974 : * latch when blocking, but there's no danger of anything filling up,
975 : * as "Setting an event that is already set has no effect.".
976 : *
977 : * Note: we assume that the kernel calls involved in latch management
978 : * will provide adequate synchronization on machines with weak memory
979 : * ordering, so that we cannot miss seeing is_set if a notification
980 : * has already been queued.
981 : */
982 23106 : if (set->latch && set->latch->is_set)
983 : {
984 1891 : occurred_events->fd = PGINVALID_SOCKET;
985 1891 : occurred_events->pos = set->latch_pos;
986 1891 : occurred_events->user_data =
987 1891 : set->events[set->latch_pos].user_data;
988 1891 : occurred_events->events = WL_LATCH_SET;
989 1891 : occurred_events++;
990 1891 : returned_events++;
991 :
992 1891 : break;
993 : }
994 :
995 : /*
996 : * Wait for events using the readiness primitive chosen at the top of
997 : * this file. If -1 is returned, a timeout has occurred; if 0, we have
998 : * to retry; everything >= 1 is the number of returned events.
999 : */
1000 21215 : rc = WaitEventSetWaitBlock(set, cur_timeout,
1001 : occurred_events, nevents);
1002 :
1003 21215 : if (rc == -1)
1004 908 : break; /* timeout occurred */
1005 : else
1006 20307 : returned_events = rc;
1007 :
1008 : /* If we're not done, update cur_timeout for next iteration */
1009 20307 : if (returned_events == 0 && timeout >= 0)
1010 : {
1011 1321 : INSTR_TIME_SET_CURRENT(cur_time);
1012 1321 : INSTR_TIME_SUBTRACT(cur_time, start_time);
1013 1321 : cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
1014 1321 : if (cur_timeout <= 0)
1015 1 : break;
1016 : }
1017 : }
1018 : #ifndef WIN32
1019 20013 : waiting = false;
1020 : #endif
1021 :
1022 20013 : pgstat_report_wait_end();
1023 :
1024 20013 : return returned_events;
1025 : }
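
/*
 * Editor's note: a sketch, not part of latch.c, of consuming several
 * events per call and using the user_data pointer registered in
 * AddWaitEventToSet() to locate per-connection state.  "ClientState" and
 * "handle_client_input()" are hypothetical, PG_WAIT_EXTENSION is just one
 * valid wait_event_info, and proc_exit() would need storage/ipc.h; kept
 * under "#if 0".
 */
#if 0
typedef struct ClientState ClientState; /* hypothetical per-socket state */
extern void handle_client_input(ClientState *cs);	/* hypothetical */

static void
sketch_multi_event_wait(WaitEventSet *set)
{
	WaitEvent	occurred[8];
	int			nready;
	int			i;

	/* wait up to one second for any of the registered events */
	nready = WaitEventSetWait(set, 1000L, occurred, lengthof(occurred),
							  PG_WAIT_EXTENSION);

	/* nready == 0 means the timeout expired */
	for (i = 0; i < nready; i++)
	{
		WaitEvent  *ev = &occurred[i];

		if (ev->events & WL_LATCH_SET)
			ResetLatch(MyLatch);
		else if (ev->events & WL_POSTMASTER_DEATH)
			proc_exit(1);
		else if (ev->events & WL_SOCKET_READABLE)
			handle_client_input((ClientState *) ev->user_data);
	}
}
#endif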
1026 :
1027 :
1028 : #if defined(WAIT_USE_EPOLL)
1029 :
1030 : /*
1031 : * Wait using linux's epoll_wait(2).
1032 : *
1033 : * This is the preferable wait method, as several readiness notifications are
1034 : * delivered, without having to iterate through all of set->events. The returned
1035 : * epoll_event structs contain a pointer to our events, making association
1036 : * easy.
1037 : */
1038 : static inline int
1039 21215 : WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1040 : WaitEvent *occurred_events, int nevents)
1041 : {
1042 21215 : int returned_events = 0;
1043 : int rc;
1044 : WaitEvent *cur_event;
1045 : struct epoll_event *cur_epoll_event;
1046 :
1047 : /* Sleep */
1048 21215 : rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,
1049 : nevents, cur_timeout);
1050 :
1051 : /* Check return code */
1052 21215 : if (rc < 0)
1053 : {
1054 : /* EINTR is okay, otherwise complain */
1055 1550 : if (errno != EINTR)
1056 : {
1057 0 : waiting = false;
1058 0 : ereport(ERROR,
1059 : (errcode_for_socket_access(),
1060 : errmsg("epoll_wait() failed: %m")));
1061 : }
1062 1550 : return 0;
1063 : }
1064 19665 : else if (rc == 0)
1065 : {
1066 : /* timeout exceeded */
1067 908 : return -1;
1068 : }
1069 :
1070 : /*
1071 : * At least one event occurred, iterate over the returned epoll events
1072 : * until they're either all processed, or we've returned all the events
1073 : * the caller desired.
1074 : */
1075 56271 : for (cur_epoll_event = set->epoll_ret_events;
1076 56271 : cur_epoll_event < (set->epoll_ret_events + rc) &&
1077 : returned_events < nevents;
1078 18757 : cur_epoll_event++)
1079 : {
1080 : /* epoll's data pointer is set to the associated WaitEvent */
1081 18757 : cur_event = (WaitEvent *) cur_epoll_event->data.ptr;
1082 :
1083 18757 : occurred_events->pos = cur_event->pos;
1084 18757 : occurred_events->user_data = cur_event->user_data;
1085 18757 : occurred_events->events = 0;
1086 :
1087 20304 : if (cur_event->events == WL_LATCH_SET &&
1088 1547 : cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
1089 : {
1090 : /* There's data in the self-pipe, clear it. */
1091 1547 : drainSelfPipe();
1092 :
1093 3094 : if (set->latch->is_set)
1094 : {
1095 3 : occurred_events->fd = PGINVALID_SOCKET;
1096 3 : occurred_events->events = WL_LATCH_SET;
1097 3 : occurred_events++;
1098 3 : returned_events++;
1099 : }
1100 : }
1101 17210 : else if (cur_event->events == WL_POSTMASTER_DEATH &&
1102 0 : cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
1103 : {
1104 : /*
1105 : * We expect an EPOLLHUP when the remote end is closed, but
1106 : * because we don't expect the pipe to become readable or to have
1107 : * any errors either, treat those cases as postmaster death, too.
1108 : *
1109 : * Be paranoid about a spurious event signalling the postmaster as
1110 : * being dead. There have been reports about that happening with
1111 : * older primitives (select(2) to be specific), and a spurious
1112 : * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
1113 : * cost much.
1114 : */
1115 0 : if (!PostmasterIsAlive())
1116 : {
1117 0 : occurred_events->fd = PGINVALID_SOCKET;
1118 0 : occurred_events->events = WL_POSTMASTER_DEATH;
1119 0 : occurred_events++;
1120 0 : returned_events++;
1121 : }
1122 : }
1123 17210 : else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
1124 : {
1125 17210 : Assert(cur_event->fd != PGINVALID_SOCKET);
1126 :
1127 34420 : if ((cur_event->events & WL_SOCKET_READABLE) &&
1128 17210 : (cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)))
1129 : {
1130 : /* data available in socket, or EOF */
1131 17210 : occurred_events->events |= WL_SOCKET_READABLE;
1132 : }
1133 :
1134 17210 : if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1135 0 : (cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)))
1136 : {
1137 : /* writable, or EOF */
1138 0 : occurred_events->events |= WL_SOCKET_WRITEABLE;
1139 : }
1140 :
1141 17210 : if (occurred_events->events != 0)
1142 : {
1143 17210 : occurred_events->fd = cur_event->fd;
1144 17210 : occurred_events++;
1145 17210 : returned_events++;
1146 : }
1147 : }
1148 : }
1149 :
1150 18757 : return returned_events;
1151 : }
1152 :
1153 : #elif defined(WAIT_USE_POLL)
1154 :
1155 : /*
1156 : * Wait using poll(2).
1157 : *
1158 : * This allows receiving readiness notifications for several events at once,
1159 : * but requires iterating through all of set->pollfds.
1160 : */
1161 : static inline int
1162 : WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1163 : WaitEvent *occurred_events, int nevents)
1164 : {
1165 : int returned_events = 0;
1166 : int rc;
1167 : WaitEvent *cur_event;
1168 : struct pollfd *cur_pollfd;
1169 :
1170 : /* Sleep */
1171 : rc = poll(set->pollfds, set->nevents, (int) cur_timeout);
1172 :
1173 : /* Check return code */
1174 : if (rc < 0)
1175 : {
1176 : /* EINTR is okay, otherwise complain */
1177 : if (errno != EINTR)
1178 : {
1179 : waiting = false;
1180 : ereport(ERROR,
1181 : (errcode_for_socket_access(),
1182 : errmsg("poll() failed: %m")));
1183 : }
1184 : return 0;
1185 : }
1186 : else if (rc == 0)
1187 : {
1188 : /* timeout exceeded */
1189 : return -1;
1190 : }
1191 :
1192 : for (cur_event = set->events, cur_pollfd = set->pollfds;
1193 : cur_event < (set->events + set->nevents) &&
1194 : returned_events < nevents;
1195 : cur_event++, cur_pollfd++)
1196 : {
1197 : /* no activity on this FD, skip */
1198 : if (cur_pollfd->revents == 0)
1199 : continue;
1200 :
1201 : occurred_events->pos = cur_event->pos;
1202 : occurred_events->user_data = cur_event->user_data;
1203 : occurred_events->events = 0;
1204 :
1205 : if (cur_event->events == WL_LATCH_SET &&
1206 : (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
1207 : {
1208 : /* There's data in the self-pipe, clear it. */
1209 : drainSelfPipe();
1210 :
1211 : if (set->latch->is_set)
1212 : {
1213 : occurred_events->fd = PGINVALID_SOCKET;
1214 : occurred_events->events = WL_LATCH_SET;
1215 : occurred_events++;
1216 : returned_events++;
1217 : }
1218 : }
1219 : else if (cur_event->events == WL_POSTMASTER_DEATH &&
1220 : (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
1221 : {
1222 : /*
1223 : * We expect a POLLHUP when the remote end is closed, but because
1224 : * we don't expect the pipe to become readable or to have any
1225 : * errors either, treat those cases as postmaster death, too.
1226 : *
1227 : * Be paranoid about a spurious event signalling the postmaster as
1228 : * being dead. There have been reports about that happening with
1229 : * older primitives (select(2) to be specific), and a spurious
1230 : * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
1231 : * cost much.
1232 : */
1233 : if (!PostmasterIsAlive())
1234 : {
1235 : occurred_events->fd = PGINVALID_SOCKET;
1236 : occurred_events->events = WL_POSTMASTER_DEATH;
1237 : occurred_events++;
1238 : returned_events++;
1239 : }
1240 : }
1241 : else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
1242 : {
1243 : int errflags = POLLHUP | POLLERR | POLLNVAL;
1244 :
1245 : Assert(cur_event->fd >= PGINVALID_SOCKET);
1246 :
1247 : if ((cur_event->events & WL_SOCKET_READABLE) &&
1248 : (cur_pollfd->revents & (POLLIN | errflags)))
1249 : {
1250 : /* data available in socket, or EOF */
1251 : occurred_events->events |= WL_SOCKET_READABLE;
1252 : }
1253 :
1254 : if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1255 : (cur_pollfd->revents & (POLLOUT | errflags)))
1256 : {
1257 : /* writeable, or EOF */
1258 : occurred_events->events |= WL_SOCKET_WRITEABLE;
1259 : }
1260 :
1261 : if (occurred_events->events != 0)
1262 : {
1263 : occurred_events->fd = cur_event->fd;
1264 : occurred_events++;
1265 : returned_events++;
1266 : }
1267 : }
1268 : }
1269 : return returned_events;
1270 : }
1271 :
1272 : #elif defined(WAIT_USE_WIN32)
1273 :
1274 : /*
1275 : * Wait using Windows' WaitForMultipleObjects().
1276 : *
1277 : * Unfortunately this will only ever return a single readiness notification at
1278 : * a time. Note that while the official documentation for
1279 : * WaitForMultipleObjects is ambiguous about multiple events being "consumed"
1280 : * with a single bWaitAll = FALSE call,
1281 : * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273 confirms
1282 : * that only one event is "consumed".
1283 : */
1284 : static inline int
1285 : WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1286 : WaitEvent *occurred_events, int nevents)
1287 : {
1288 : int returned_events = 0;
1289 : DWORD rc;
1290 : WaitEvent *cur_event;
1291 :
1292 : /* Reset any wait events that need it */
1293 : for (cur_event = set->events;
1294 : cur_event < (set->events + set->nevents);
1295 : cur_event++)
1296 : {
1297 : if (cur_event->reset)
1298 : {
1299 : WaitEventAdjustWin32(set, cur_event);
1300 : cur_event->reset = false;
1301 : }
1302 :
1303 : /*
1304 : * Windows does not guarantee to log an FD_WRITE network event
1305 : * indicating that more data can be sent unless the previous send()
1306 : * failed with WSAEWOULDBLOCK. While our caller might well have made
1307 : * such a call, we cannot assume that here. Therefore, if waiting for
1308 : * write-ready, force the issue by doing a dummy send(). If the dummy
1309 : * send() succeeds, assume that the socket is in fact write-ready, and
1310 : * return immediately. Also, if it fails with something other than
1311 : * WSAEWOULDBLOCK, return a write-ready indication to let our caller
1312 : * deal with the error condition.
1313 : */
1314 : if (cur_event->events & WL_SOCKET_WRITEABLE)
1315 : {
1316 : char c;
1317 : WSABUF buf;
1318 : DWORD sent;
1319 : int r;
1320 :
1321 : buf.buf = &c;
1322 : buf.len = 0;
1323 :
1324 : r = WSASend(cur_event->fd, &buf, 1, &sent, 0, NULL, NULL);
1325 : if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
1326 : {
1327 : occurred_events->pos = cur_event->pos;
1328 : occurred_events->user_data = cur_event->user_data;
1329 : occurred_events->events = WL_SOCKET_WRITEABLE;
1330 : occurred_events->fd = cur_event->fd;
1331 : return 1;
1332 : }
1333 : }
1334 : }
1335 :
1336 : /*
1337 : * Sleep.
1338 : *
1339 : * Need to wait on ->nevents + 1 handles, because the signal handle is in [0].
1340 : */
1341 : rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
1342 : cur_timeout);
1343 :
1344 : /* Check return code */
1345 : if (rc == WAIT_FAILED)
1346 : elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",
1347 : GetLastError());
1348 : else if (rc == WAIT_TIMEOUT)
1349 : {
1350 : /* timeout exceeded */
1351 : return -1;
1352 : }
1353 :
1354 : if (rc == WAIT_OBJECT_0)
1355 : {
1356 : /* Service newly-arrived signals */
1357 : pgwin32_dispatch_queued_signals();
1358 : return 0; /* retry */
1359 : }
1360 :
1361 : /*
1362 : * With an offset of one, due to the always present pgwin32_signal_event,
1363 : * the handle offset directly corresponds to a wait event.
1364 : */
1365 : cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1];
1366 :
1367 : occurred_events->pos = cur_event->pos;
1368 : occurred_events->user_data = cur_event->user_data;
1369 : occurred_events->events = 0;
1370 :
1371 : if (cur_event->events == WL_LATCH_SET)
1372 : {
1373 : if (!ResetEvent(set->latch->event))
1374 : elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());
1375 :
1376 : if (set->latch->is_set)
1377 : {
1378 : occurred_events->fd = PGINVALID_SOCKET;
1379 : occurred_events->events = WL_LATCH_SET;
1380 : occurred_events++;
1381 : returned_events++;
1382 : }
1383 : }
1384 : else if (cur_event->events == WL_POSTMASTER_DEATH)
1385 : {
1386 : /*
1387 : * Postmaster apparently died. Since the consequences of falsely
1388 : * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we take
1389 : * the trouble to positively verify this with PostmasterIsAlive(),
1390 : * even though there is no known reason to think that the event could
1391 : * be falsely set on Windows.
1392 : */
1393 : if (!PostmasterIsAlive())
1394 : {
1395 : occurred_events->fd = PGINVALID_SOCKET;
1396 : occurred_events->events = WL_POSTMASTER_DEATH;
1397 : occurred_events++;
1398 : returned_events++;
1399 : }
1400 : }
1401 : else if (cur_event->events & WL_SOCKET_MASK)
1402 : {
1403 : WSANETWORKEVENTS resEvents;
1404 : HANDLE handle = set->handles[cur_event->pos + 1];
1405 :
1406 : Assert(cur_event->fd);
1407 :
1408 : occurred_events->fd = cur_event->fd;
1409 :
1410 : ZeroMemory(&resEvents, sizeof(resEvents));
1411 : if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0)
1412 : elog(ERROR, "failed to enumerate network events: error code %u",
1413 : WSAGetLastError());
1414 : if ((cur_event->events & WL_SOCKET_READABLE) &&
1415 : (resEvents.lNetworkEvents & FD_READ))
1416 : {
1417 : /* data available in socket */
1418 : occurred_events->events |= WL_SOCKET_READABLE;
1419 :
1420 : /*------
1421 : * WaitForMultipleObjects doesn't guarantee that a read event will
1422 : * be returned if the latch is set at the same time. Even if it
1423 : * did, the caller might drop that event expecting it to reoccur
1424 : * on next call. So, we must force the event to be reset if this
1425 : * WaitEventSet is used again in order to avoid an indefinite
1426 : * hang. Refer https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx
1427 : * for the behavior of socket events.
1428 : *------
1429 : */
1430 : cur_event->reset = true;
1431 : }
1432 : if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1433 : (resEvents.lNetworkEvents & FD_WRITE))
1434 : {
1435 : /* writeable */
1436 : occurred_events->events |= WL_SOCKET_WRITEABLE;
1437 : }
1438 : if ((cur_event->events & WL_SOCKET_CONNECTED) &&
1439 : (resEvents.lNetworkEvents & FD_CONNECT))
1440 : {
1441 : /* connected */
1442 : occurred_events->events |= WL_SOCKET_CONNECTED;
1443 : }
1444 : if (resEvents.lNetworkEvents & FD_CLOSE)
1445 : {
1446 : /* EOF/error, so signal all caller-requested socket flags */
1447 : occurred_events->events |= (cur_event->events & WL_SOCKET_MASK);
1448 : }
1449 :
1450 : if (occurred_events->events != 0)
1451 : {
1452 : occurred_events++;
1453 : returned_events++;
1454 : }
1455 : }
1456 :
1457 : return returned_events;
1458 : }
1459 : #endif
1460 :
1461 : /*
1462 : * SetLatch uses SIGUSR1 to wake up the process waiting on the latch.
1463 : *
1464 : * Wake up WaitLatch, if we're waiting. (We might not be, since SIGUSR1 is
1465 : * overloaded for multiple purposes; or we might not have reached WaitLatch
1466 : * yet, in which case we don't need to fill the pipe either.)
1467 : *
1468 : * NB: when calling this in a signal handler, be sure to save and restore
1469 : * errno around it.
1470 : */
1471 : #ifndef WIN32
1472 : void
1473 1767 : latch_sigusr1_handler(void)
1474 : {
1475 1767 : if (waiting)
1476 1583 : sendSelfPipeByte();
1477 1767 : }
1478 : #endif /* !WIN32 */
1479 :
1480 : /* Send one byte to the self-pipe, to wake up WaitLatch */
1481 : #ifndef WIN32
1482 : static void
1483 2045 : sendSelfPipeByte(void)
1484 : {
1485 : int rc;
1486 2045 : char dummy = 0;
1487 :
1488 : retry:
1489 2045 : rc = write(selfpipe_writefd, &dummy, 1);
1490 2045 : if (rc < 0)
1491 : {
1492 : /* If interrupted by signal, just retry */
1493 0 : if (errno == EINTR)
1494 0 : goto retry;
1495 :
1496 : /*
1497 : * If the pipe is full, we don't need to retry; the data that's there
1498 : * already is enough to wake up WaitLatch.
1499 : */
1500 0 : if (errno == EAGAIN || errno == EWOULDBLOCK)
1501 0 : return;
1502 :
1503 : /*
1504 : * Oops, the write() failed for some other reason. We might be in a
1505 : * signal handler, so it's not safe to elog(). We have no choice but
1506 : * silently ignore the error.
1507 : */
1508 0 : return;
1509 : }
1510 : }
1511 : #endif /* !WIN32 */
1512 :
1513 : /*
1514 : * Read all available data from the self-pipe
1515 : *
1516 : * Note: this is only called when waiting = true. If it fails and doesn't
1517 : * return, it must reset that flag first (though ideally, this will never
1518 : * happen).
1519 : */
1520 : #ifndef WIN32
1521 : static void
1522 1547 : drainSelfPipe(void)
1523 : {
1524 : /*
1525 : * There shouldn't normally be more than one byte in the pipe, or maybe a
1526 : * few bytes if multiple processes run SetLatch at the same instant.
1527 : */
1528 : char buf[16];
1529 : int rc;
1530 :
1531 : for (;;)
1532 : {
1533 1547 : rc = read(selfpipe_readfd, buf, sizeof(buf));
1534 1547 : if (rc < 0)
1535 : {
1536 0 : if (errno == EAGAIN || errno == EWOULDBLOCK)
1537 : break; /* the pipe is empty */
1538 0 : else if (errno == EINTR)
1539 0 : continue; /* retry */
1540 : else
1541 : {
1542 0 : waiting = false;
1543 0 : elog(ERROR, "read() on self-pipe failed: %m");
1544 : }
1545 : }
1546 1547 : else if (rc == 0)
1547 : {
1548 0 : waiting = false;
1549 0 : elog(ERROR, "unexpected EOF on self-pipe");
1550 : }
1551 1547 : else if (rc < sizeof(buf))
1552 : {
1553 : /* we successfully drained the pipe; no need to read() again */
1554 1547 : break;
1555 : }
1556 : /* else buffer wasn't big enough, so read again */
1557 0 : }
1558 1547 : }
1559 : #endif /* !WIN32 */