redict/src/ae_kqueue.c
sundb 306a5ccd2d
Fix the timing of read and write events under kqueue (#9416)
Normally we execute the read event first and then the write event.
When the barrier is set, we will do it reverse.
However, under `kqueue`, if an `fd` has both read and write events,
reading the event using `kevent` will generate two events, which will
result in uncontrolled read and write timing.

This also means that the guarantees of AOF `appendfsync` = `always` are
not met on MacOS without this fix.

The main change to this pr is to cache the events already obtained when reading
them, so that if the same `fd` occurs again, only the mask in the cache is updated,
rather than a new event is generated.

This was exposed by the following test failure on MacOS:
```
*** [err]: AOF fsync always barrier issue in tests/integration/aof.tcl
Expected 544 != 544 (context: type eval line 26 cmd {assert {$size1 != $size2}} proc ::test)
```
2021-09-02 11:07:51 +03:00

191 lines
6.6 KiB
C

/* Kqueue(2)-based ae.c module
*
* Copyright (C) 2009 Harish Mallipeddi - harish.mallipeddi@gmail.com
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Redis nor the names of its contributors may be used
* to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
typedef struct aeApiState {
int kqfd;
struct kevent *events;
/* Events mask for merge read and write event.
* To reduce memory consumption, we use 2 bits to store the mask
* of an event, so that 1 byte will store the mask of 4 events. */
char *eventsMask;
} aeApiState;
#define EVENT_MASK_MALLOC_SIZE(sz) (((sz) + 3) / 4)
#define EVENT_MASK_OFFSET(fd) ((fd) % 4 * 2)
#define EVENT_MASK_ENCODE(fd, mask) (((mask) & 0x3) << EVENT_MASK_OFFSET(fd))
static inline int getEventMask(const char *eventsMask, int fd) {
return (eventsMask[fd/4] >> EVENT_MASK_OFFSET(fd)) & 0x3;
}
static inline void addEventMask(char *eventsMask, int fd, int mask) {
eventsMask[fd/4] |= EVENT_MASK_ENCODE(fd, mask);
}
static inline void resetEventMask(char *eventsMask, int fd) {
eventsMask[fd/4] &= ~EVENT_MASK_ENCODE(fd, 0x3);
}
static int aeApiCreate(aeEventLoop *eventLoop) {
aeApiState *state = zmalloc(sizeof(aeApiState));
if (!state) return -1;
state->events = zmalloc(sizeof(struct kevent)*eventLoop->setsize);
if (!state->events) {
zfree(state);
return -1;
}
state->kqfd = kqueue();
if (state->kqfd == -1) {
zfree(state->events);
zfree(state);
return -1;
}
anetCloexec(state->kqfd);
state->eventsMask = zmalloc(EVENT_MASK_MALLOC_SIZE(eventLoop->setsize));
memset(state->eventsMask, 0, EVENT_MASK_MALLOC_SIZE(eventLoop->setsize));
eventLoop->apidata = state;
return 0;
}
static int aeApiResize(aeEventLoop *eventLoop, int setsize) {
aeApiState *state = eventLoop->apidata;
state->events = zrealloc(state->events, sizeof(struct kevent)*setsize);
state->eventsMask = zrealloc(state->eventsMask, EVENT_MASK_MALLOC_SIZE(setsize));
memset(state->eventsMask, 0, EVENT_MASK_MALLOC_SIZE(setsize));
return 0;
}
static void aeApiFree(aeEventLoop *eventLoop) {
aeApiState *state = eventLoop->apidata;
close(state->kqfd);
zfree(state->events);
zfree(state->eventsMask);
zfree(state);
}
static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) {
aeApiState *state = eventLoop->apidata;
struct kevent ke;
if (mask & AE_READABLE) {
EV_SET(&ke, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
if (kevent(state->kqfd, &ke, 1, NULL, 0, NULL) == -1) return -1;
}
if (mask & AE_WRITABLE) {
EV_SET(&ke, fd, EVFILT_WRITE, EV_ADD, 0, 0, NULL);
if (kevent(state->kqfd, &ke, 1, NULL, 0, NULL) == -1) return -1;
}
return 0;
}
static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int mask) {
aeApiState *state = eventLoop->apidata;
struct kevent ke;
if (mask & AE_READABLE) {
EV_SET(&ke, fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
kevent(state->kqfd, &ke, 1, NULL, 0, NULL);
}
if (mask & AE_WRITABLE) {
EV_SET(&ke, fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL);
kevent(state->kqfd, &ke, 1, NULL, 0, NULL);
}
}
static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) {
aeApiState *state = eventLoop->apidata;
int retval, numevents = 0;
if (tvp != NULL) {
struct timespec timeout;
timeout.tv_sec = tvp->tv_sec;
timeout.tv_nsec = tvp->tv_usec * 1000;
retval = kevent(state->kqfd, NULL, 0, state->events, eventLoop->setsize,
&timeout);
} else {
retval = kevent(state->kqfd, NULL, 0, state->events, eventLoop->setsize,
NULL);
}
if (retval > 0) {
int j;
/* Normally we execute the read event first and then the write event.
* When the barrier is set, we will do it reverse.
*
* However, under kqueue, read and write events would be separate
* events, which would make it impossible to control the order of
* reads and writes. So we store the event's mask we've got and merge
* the same fd events later. */
for (j = 0; j < retval; j++) {
struct kevent *e = state->events+j;
int fd = e->ident;
int mask = 0;
if (e->filter == EVFILT_READ) mask = AE_READABLE;
else if (e->filter == EVFILT_WRITE) mask = AE_WRITABLE;
addEventMask(state->eventsMask, fd, mask);
}
/* Re-traversal to merge read and write events, and set the fd's mask to
* 0 so that events are not added again when the fd is encountered again. */
numevents = 0;
for (j = 0; j < retval; j++) {
struct kevent *e = state->events+j;
int fd = e->ident;
int mask = getEventMask(state->eventsMask, fd);
if (mask) {
eventLoop->fired[numevents].fd = fd;
eventLoop->fired[numevents].mask = mask;
resetEventMask(state->eventsMask, fd);
numevents++;
}
}
} else if (retval == -1 && errno != EINTR) {
panic("aeApiPoll: kevent, %s", strerror(errno));
}
return numevents;
}
static char *aeApiName(void) {
return "kqueue";
}