/* ae.c module for illumos event ports. * * Copyright (c) 2012, Joyent, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Redis nor the names of its contributors may be used * to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include static int evport_debug = 0; /* * This file implements the ae API using event ports, present on Solaris-based * systems since Solaris 10. Using the event port interface, we associate file * descriptors with the port. Each association also includes the set of poll(2) * events that the consumer is interested in (e.g., POLLIN and POLLOUT). * * There's one tricky piece to this implementation: when we return events via * aeApiPoll, the corresponding file descriptors become dissociated from the * port. This is necessary because poll events are level-triggered, so if the * fd didn't become dissociated, it would immediately fire another event since * the underlying state hasn't changed yet. We must re-associate the file * descriptor, but only after we know that our caller has actually read from it. * The ae API does not tell us exactly when that happens, but we do know that * it must happen by the time aeApiPoll is called again. Our solution is to * keep track of the last fds returned by aeApiPoll and re-associate them next * time aeApiPoll is invoked. * * To summarize, in this module, each fd association is EITHER (a) represented * only via the in-kernel association OR (b) represented by pending_fds and * pending_masks. (b) is only true for the last fds we returned from aeApiPoll, * and only until we enter aeApiPoll again (at which point we restore the * in-kernel association). */ #define MAX_EVENT_BATCHSZ 512 typedef struct aeApiState { int portfd; /* event port */ uint_t npending; /* # of pending fds */ int pending_fds[MAX_EVENT_BATCHSZ]; /* pending fds */ int pending_masks[MAX_EVENT_BATCHSZ]; /* pending fds' masks */ } aeApiState; static int aeApiCreate(aeEventLoop *eventLoop) { int i; aeApiState *state = zmalloc(sizeof(aeApiState)); if (!state) return -1; state->portfd = port_create(); if (state->portfd == -1) { zfree(state); return -1; } anetCloexec(state->portfd); state->npending = 0; for (i = 0; i < MAX_EVENT_BATCHSZ; i++) { state->pending_fds[i] = -1; state->pending_masks[i] = AE_NONE; } eventLoop->apidata = state; return 0; } static int aeApiResize(aeEventLoop *eventLoop, int setsize) { (void) eventLoop; (void) setsize; /* Nothing to resize here. */ return 0; } static void aeApiFree(aeEventLoop *eventLoop) { aeApiState *state = eventLoop->apidata; close(state->portfd); zfree(state); } static int aeApiLookupPending(aeApiState *state, int fd) { uint_t i; for (i = 0; i < state->npending; i++) { if (state->pending_fds[i] == fd) return (i); } return (-1); } /* * Helper function to invoke port_associate for the given fd and mask. */ static int aeApiAssociate(const char *where, int portfd, int fd, int mask) { int events = 0; int rv, err; if (mask & AE_READABLE) events |= POLLIN; if (mask & AE_WRITABLE) events |= POLLOUT; if (evport_debug) fprintf(stderr, "%s: port_associate(%d, 0x%x) = ", where, fd, events); rv = port_associate(portfd, PORT_SOURCE_FD, fd, events, (void *)(uintptr_t)mask); err = errno; if (evport_debug) fprintf(stderr, "%d (%s)\n", rv, rv == 0 ? "no error" : strerror(err)); if (rv == -1) { fprintf(stderr, "%s: port_associate: %s\n", where, strerror(err)); if (err == EAGAIN) fprintf(stderr, "aeApiAssociate: event port limit exceeded."); } return rv; } static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) { aeApiState *state = eventLoop->apidata; int fullmask, pfd; if (evport_debug) fprintf(stderr, "aeApiAddEvent: fd %d mask 0x%x\n", fd, mask); /* * Since port_associate's "events" argument replaces any existing events, we * must be sure to include whatever events are already associated when * we call port_associate() again. */ fullmask = mask | eventLoop->events[fd].mask; pfd = aeApiLookupPending(state, fd); if (pfd != -1) { /* * This fd was recently returned from aeApiPoll. It should be safe to * assume that the consumer has processed that poll event, but we play * it safer by simply updating pending_mask. The fd will be * re-associated as usual when aeApiPoll is called again. */ if (evport_debug) fprintf(stderr, "aeApiAddEvent: adding to pending fd %d\n", fd); state->pending_masks[pfd] |= fullmask; return 0; } return (aeApiAssociate("aeApiAddEvent", state->portfd, fd, fullmask)); } static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int mask) { aeApiState *state = eventLoop->apidata; int fullmask, pfd; if (evport_debug) fprintf(stderr, "del fd %d mask 0x%x\n", fd, mask); pfd = aeApiLookupPending(state, fd); if (pfd != -1) { if (evport_debug) fprintf(stderr, "deleting event from pending fd %d\n", fd); /* * This fd was just returned from aeApiPoll, so it's not currently * associated with the port. All we need to do is update * pending_mask appropriately. */ state->pending_masks[pfd] &= ~mask; if (state->pending_masks[pfd] == AE_NONE) state->pending_fds[pfd] = -1; return; } /* * The fd is currently associated with the port. Like with the add case * above, we must look at the full mask for the file descriptor before * updating that association. We don't have a good way of knowing what the * events are without looking into the eventLoop state directly. We rely on * the fact that our caller has already updated the mask in the eventLoop. */ fullmask = eventLoop->events[fd].mask; if (fullmask == AE_NONE) { /* * We're removing *all* events, so use port_dissociate to remove the * association completely. Failure here indicates a bug. */ if (evport_debug) fprintf(stderr, "aeApiDelEvent: port_dissociate(%d)\n", fd); if (port_dissociate(state->portfd, PORT_SOURCE_FD, fd) != 0) { perror("aeApiDelEvent: port_dissociate"); abort(); /* will not return */ } } else if (aeApiAssociate("aeApiDelEvent", state->portfd, fd, fullmask) != 0) { /* * ENOMEM is a potentially transient condition, but the kernel won't * generally return it unless things are really bad. EAGAIN indicates * we've reached a resource limit, for which it doesn't make sense to * retry (counter-intuitively). All other errors indicate a bug. In any * of these cases, the best we can do is to abort. */ abort(); /* will not return */ } } static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) { aeApiState *state = eventLoop->apidata; struct timespec timeout, *tsp; uint_t mask, i; uint_t nevents; port_event_t event[MAX_EVENT_BATCHSZ]; /* * If we've returned fd events before, we must re-associate them with the * port now, before calling port_get(). See the block comment at the top of * this file for an explanation of why. */ for (i = 0; i < state->npending; i++) { if (state->pending_fds[i] == -1) /* This fd has since been deleted. */ continue; if (aeApiAssociate("aeApiPoll", state->portfd, state->pending_fds[i], state->pending_masks[i]) != 0) { /* See aeApiDelEvent for why this case is fatal. */ abort(); } state->pending_masks[i] = AE_NONE; state->pending_fds[i] = -1; } state->npending = 0; if (tvp != NULL) { timeout.tv_sec = tvp->tv_sec; timeout.tv_nsec = tvp->tv_usec * 1000; tsp = &timeout; } else { tsp = NULL; } /* * port_getn can return with errno == ETIME having returned some events (!). * So if we get ETIME, we check nevents, too. */ nevents = 1; if (port_getn(state->portfd, event, MAX_EVENT_BATCHSZ, &nevents, tsp) == -1 && (errno != ETIME || nevents == 0)) { if (errno == ETIME || errno == EINTR) return 0; /* Any other error indicates a bug. */ perror("aeApiPoll: port_get"); abort(); } state->npending = nevents; for (i = 0; i < nevents; i++) { mask = 0; if (event[i].portev_events & POLLIN) mask |= AE_READABLE; if (event[i].portev_events & POLLOUT) mask |= AE_WRITABLE; eventLoop->fired[i].fd = event[i].portev_object; eventLoop->fired[i].mask = mask; if (evport_debug) fprintf(stderr, "aeApiPoll: fd %d mask 0x%x\n", (int)event[i].portev_object, mask); state->pending_fds[i] = event[i].portev_object; state->pending_masks[i] = (uintptr_t)event[i].portev_user; } return nevents; } static char *aeApiName(void) { return "evport"; }