From fe8352540fa5d6157648427b0651de9d5574e48d Mon Sep 17 00:00:00 2001
From: antirez <antirez@gmail.com>
Date: Wed, 12 Feb 2014 12:47:10 +0100
Subject: [PATCH] AOF: don't abort on write errors unless fsync is 'always'.

A system similar to the RDB write error handling is used, in which when
we can't write to the AOF file, writes are no longer accepted until we
are able to write again.

For fsync == always we still abort on errors since there is currently no
easy way to avoid replying with success to the user otherwise, and this
would violate the contract with the user of only acknowledging data
already secured on disk.
---
 src/aof.c   | 80 +++++++++++++++++++++++++++++++++++++++++++----------
 src/redis.c | 33 ++++++++++++++++------
 src/redis.h |  2 ++
 3 files changed, 91 insertions(+), 24 deletions(-)

diff --git a/src/aof.c b/src/aof.c
index d59e4061f..dbe7bfa6d 100644
--- a/src/aof.c
+++ b/src/aof.c
@@ -226,6 +226,7 @@ int startAppendOnly(void) {
  *
  * However if force is set to 1 we'll write regardless of the background
  * fsync. */
+#define AOF_WRITE_LOG_ERROR_RATE 30 /* Seconds between errors logging. */
 void flushAppendOnlyFile(int force) {
     ssize_t nwritten;
     int sync_in_progress = 0;
@@ -267,27 +268,76 @@ void flushAppendOnlyFile(int force) {
      * or alike */
     nwritten = write(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
     if (nwritten != (signed)sdslen(server.aof_buf)) {
-        /* Ooops, we are in troubles. The best thing to do for now is
-         * aborting instead of giving the illusion that everything is
-         * working as expected. */
+        static time_t last_write_error_log = 0;
+        int can_log = 0;
+
+        /* Limit logging rate to 1 line per AOF_WRITE_LOG_ERROR_RATE seconds. */
+        if ((server.unixtime - last_write_error_log) > AOF_WRITE_LOG_ERROR_RATE) {
+            can_log = 1;
+            last_write_error_log = server.unixtime;
+        }
+
+        /* Lof the AOF write error and record the error code. */
         if (nwritten == -1) {
-            redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
+            if (can_log) {
+                redisLog(REDIS_WARNING,"Error writing to the AOF file: %s",
+                    strerror(errno));
+                server.aof_last_write_errno = errno;
+            }
         } else {
-            redisLog(REDIS_WARNING,"Exiting on short write while writing to "
-                                   "the append-only file: %s (nwritten=%ld, "
-                                   "expected=%ld)",
-                                   strerror(errno),
-                                   (long)nwritten,
-                                   (long)sdslen(server.aof_buf));
+            if (can_log) {
+                redisLog(REDIS_WARNING,"Short write while writing to "
+                                       "the AOF file: (nwritten=%lld, "
+                                       "expected=%lld)",
+                                       (long long)nwritten,
+                                       (long long)sdslen(server.aof_buf));
+            }
 
             if (ftruncate(server.aof_fd, server.aof_current_size) == -1) {
-                redisLog(REDIS_WARNING, "Could not remove short write "
-                         "from the append-only file.  Redis may refuse "
-                         "to load the AOF the next time it starts.  "
-                         "ftruncate: %s", strerror(errno));
+                if (can_log) {
+                    redisLog(REDIS_WARNING, "Could not remove short write "
+                             "from the append-only file.  Redis may refuse "
+                             "to load the AOF the next time it starts.  "
+                             "ftruncate: %s", strerror(errno));
+                }
+            } else {
+                /* If the ftrunacate() succeeded we can set nwritten to
+                 * -1 since there is no longer partial data into the AOF. */
+                nwritten = -1;
             }
+            server.aof_last_write_errno = ENOSPC;
+        }
+
+        /* Handle the AOF write error. */
+        if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
+            /* We can't recover when the fsync policy is ALWAYS since the
+             * reply for the client is already in the output buffers, and we
+             * have the contract with the user that on acknowledged write data
+             * is synched on disk. */
+            redisLog(REDIS_WARNING,"Can't recover from AOF write error when the AOF fsync policy is 'always'. Exiting...");
+            exit(1);
+        } else {
+            /* Recover from failed write leaving data into the buffer. However
+             * set an error to stop accepting writes as long as the error
+             * condition is not cleared. */
+            server.aof_last_write_status = REDIS_ERR;
+
+            /* Trim the sds buffer if there was a partial write, and there
+             * was no way to undo it with ftruncate(2). */
+            if (nwritten > 0) {
+                server.aof_current_size += nwritten;
+                sdsrange(server.aof_buf,nwritten,-1);
+            }
+            return; /* We'll try again on the next call... */
+        }
+    } else {
+        /* Successful write(2). If AOF was in error state, restore the
+         * OK state and log the event. */
+        if (server.aof_last_write_status == REDIS_ERR) {
+            redisLog(REDIS_WARNING,
+                "AOF write error looks solved, Redis can write again.");
+            server.aof_last_write_status = REDIS_OK;
         }
-        exit(1);
     }
     server.aof_current_size += nwritten;
 
diff --git a/src/redis.c b/src/redis.c
index 059c43d47..5299c69d0 100644
--- a/src/redis.c
+++ b/src/redis.c
@@ -1167,9 +1167,13 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
     }
 
 
-    /* If we postponed an AOF buffer flush, let's try to do it every time the
-     * cron function is called. */
-    if (server.aof_flush_postponed_start) flushAppendOnlyFile(0);
+    /* AOF: we may have postponed buffer flush, or were not able to
+     * write our buffer because of write(2) error. Try again here. */
+    if (server.aof_flush_postponed_start ||
+        server.aof_last_write_status == REDIS_ERR)
+    {
+        flushAppendOnlyFile(0);
+    }
 
     /* Close clients that need to be closed asynchronous */
     freeClientsInAsyncFreeQueue();
@@ -1670,6 +1674,8 @@ void initServer() {
     server.unixtime = time(NULL);
     server.mstime = mstime();
     server.lastbgsave_status = REDIS_OK;
+    server.aof_last_write_status = REDIS_OK;
+    server.aof_last_write_errno = 0;
     server.repl_good_slaves_count = 0;
 
     /* Create the serverCron() time event, that's our main way to process
@@ -2035,15 +2041,22 @@ int processCommand(redisClient *c) {
 
     /* Don't accept write commands if there are problems persisting on disk
      * and if this is a master instance. */
-    if (server.stop_writes_on_bgsave_err &&
-        server.saveparamslen > 0
-        && server.lastbgsave_status == REDIS_ERR &&
+    if (((server.stop_writes_on_bgsave_err &&
+          server.saveparamslen > 0 &&
+          server.lastbgsave_status == REDIS_ERR) ||
+          server.aof_last_write_status == REDIS_ERR) &&
         server.masterhost == NULL &&
         (c->cmd->flags & REDIS_CMD_WRITE ||
          c->cmd->proc == pingCommand))
     {
         flagTransaction(c);
-        addReply(c, shared.bgsaveerr);
+        if (server.aof_last_write_status == REDIS_OK)
+            addReply(c, shared.bgsaveerr);
+        else
+            addReplySds(c,
+                sdscatprintf(sdsempty(),
+                "-MISCONF Errors writing to the AOF file: %s\r\n",
+                strerror(server.aof_last_write_errno)));
         return REDIS_OK;
     }
 
@@ -2426,7 +2439,8 @@ sds genRedisInfoString(char *section) {
             "aof_rewrite_scheduled:%d\r\n"
             "aof_last_rewrite_time_sec:%jd\r\n"
             "aof_current_rewrite_time_sec:%jd\r\n"
-            "aof_last_bgrewrite_status:%s\r\n",
+            "aof_last_bgrewrite_status:%s\r\n"
+            "aof_last_write_status:%s\r\n",
             server.loading,
             server.dirty,
             server.rdb_child_pid != -1,
@@ -2441,7 +2455,8 @@ sds genRedisInfoString(char *section) {
             (intmax_t)server.aof_rewrite_time_last,
             (intmax_t)((server.aof_child_pid == -1) ?
                 -1 : time(NULL)-server.aof_rewrite_time_start),
-            (server.aof_lastbgrewrite_status == REDIS_OK) ? "ok" : "err");
+            (server.aof_lastbgrewrite_status == REDIS_OK) ? "ok" : "err",
+            (server.aof_last_write_status == REDIS_OK) ? "ok" : "err");
 
         if (server.aof_state != REDIS_AOF_OFF) {
             info = sdscatprintf(info,
diff --git a/src/redis.h b/src/redis.h
index 504d7947f..d7a9eea0a 100644
--- a/src/redis.h
+++ b/src/redis.h
@@ -691,6 +691,8 @@ struct redisServer {
     int aof_lastbgrewrite_status;   /* REDIS_OK or REDIS_ERR */
     unsigned long aof_delayed_fsync;  /* delayed AOF fsync() counter */
     int aof_rewrite_incremental_fsync;/* fsync incrementally while rewriting? */
+    int aof_last_write_status;      /* REDIS_OK or REDIS_ERR */
+    int aof_last_write_errno;       /* Valid if aof_last_write_status is ERR */
     /* RDB persistence */
     long long dirty;                /* Changes to DB from the last save */
     long long dirty_before_bgsave;  /* Used to restore dirty on failed BGSAVE */