diff --git a/src/Makefile b/src/Makefile index c28f6f2b9..75d18f5e6 100644 --- a/src/Makefile +++ b/src/Makefile @@ -345,7 +345,7 @@ endif REDIS_SERVER_NAME=redis-server$(PROG_SUFFIX) REDIS_SENTINEL_NAME=redis-sentinel$(PROG_SUFFIX) -REDIS_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o +REDIS_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o 
monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o REDIS_CLI_NAME=redis-cli$(PROG_SUFFIX) REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o ae.o redisassert.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o REDIS_BENCHMARK_NAME=redis-benchmark$(PROG_SUFFIX) diff --git a/src/cluster.c b/src/cluster.c index a4e9080b3..74d6a4d6d 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1,1341 +1,8 @@ -/* Redis Cluster implementation. - * - * Copyright (c) 2009-2012, Salvatore Sanfilippo - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - #include "server.h" #include "cluster.h" -#include "endianconv.h" -#include "connection.h" -#include -#include -#include -#include -#include -#include -#include -#include #include -/* A global reference to myself is handy to make code more clear. - * Myself always points to server.cluster->myself, that is, the clusterNode - * that represents this node. */ -clusterNode *myself = NULL; - -clusterNode *createClusterNode(char *nodename, int flags); -void clusterAddNode(clusterNode *node); -void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask); -void clusterReadHandler(connection *conn); -void clusterSendPing(clusterLink *link, int type); -void clusterSendFail(char *nodename); -void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request); -void clusterUpdateState(void); -int clusterNodeGetSlotBit(clusterNode *n, int slot); -list *clusterGetNodesInMyShard(clusterNode *node); -int clusterNodeAddSlave(clusterNode *master, clusterNode *slave); -int clusterAddSlot(clusterNode *n, int slot); -int clusterDelSlot(int slot); -int clusterDelNodeSlots(clusterNode *node); -int clusterNodeSetSlotBit(clusterNode *n, int slot); -void clusterSetMaster(clusterNode *n); -void clusterHandleSlaveFailover(void); -void clusterHandleSlaveMigration(int max_slaves); -int bitmapTestBit(unsigned char *bitmap, int pos); -void bitmapSetBit(unsigned char *bitmap, int pos); -void bitmapClearBit(unsigned char *bitmap, int 
pos); -void clusterDoBeforeSleep(int flags); -void clusterSendUpdate(clusterLink *link, clusterNode *node); -void resetManualFailover(void); -void clusterCloseAllSlots(void); -void clusterSetNodeAsMaster(clusterNode *n); -void clusterDelNode(clusterNode *delnode); -sds representClusterNodeFlags(sds ci, uint16_t flags); -sds representSlotInfo(sds ci, uint16_t *slot_info_pairs, int slot_info_pairs_count); -void clusterFreeNodesSlotsInfo(clusterNode *n); -uint64_t clusterGetMaxEpoch(void); -int clusterBumpConfigEpochWithoutConsensus(void); -void moduleCallClusterReceivers(const char *sender_id, uint64_t module_id, uint8_t type, const unsigned char *payload, uint32_t len); -const char *clusterGetMessageTypeString(int type); -void removeChannelsInSlot(unsigned int slot); -unsigned int countKeysInSlot(unsigned int hashslot); -unsigned int countChannelsInSlot(unsigned int hashslot); -unsigned int delKeysInSlot(unsigned int hashslot); -void clusterAddNodeToShard(const char *shard_id, clusterNode *node); -list *clusterLookupNodeListByShardId(const char *shard_id); -void clusterRemoveNodeFromShard(clusterNode *node); -int auxShardIdSetter(clusterNode *n, void *value, int length); -sds auxShardIdGetter(clusterNode *n, sds s); -int auxShardIdPresent(clusterNode *n); -int auxHumanNodenameSetter(clusterNode *n, void *value, int length); -sds auxHumanNodenameGetter(clusterNode *n, sds s); -int auxHumanNodenamePresent(clusterNode *n); -int auxTcpPortSetter(clusterNode *n, void *value, int length); -sds auxTcpPortGetter(clusterNode *n, sds s); -int auxTcpPortPresent(clusterNode *n); -int auxTlsPortSetter(clusterNode *n, void *value, int length); -sds auxTlsPortGetter(clusterNode *n, sds s); -int auxTlsPortPresent(clusterNode *n); -static void clusterBuildMessageHdr(clusterMsg *hdr, int type, size_t msglen); - -int getNodeDefaultClientPort(clusterNode *n) { - return server.tls_cluster ? 
n->tls_port : n->tcp_port; -} - -static inline int getNodeDefaultReplicationPort(clusterNode *n) { - return server.tls_replication ? n->tls_port : n->tcp_port; -} - -static inline int getNodeClientPort(clusterNode *n, int use_tls) { - return use_tls ? n->tls_port : n->tcp_port; -} - -static inline int defaultClientPort(void) { - return server.tls_cluster ? server.tls_port : server.port; -} - -/* When a cluster command is called, we need to decide whether to return TLS info or - * non-TLS info by the client's connection type. However if the command is called by - * a Lua script or RM_call, there is no connection in the fake client, so we use - * server.current_client here to get the real client if available. And if it is not - * available (modules may call commands without a real client), we return the default - * info, which is determined by server.tls_cluster. */ -static int shouldReturnTlsInfo(void) { - if (server.current_client && server.current_client->conn) { - return connIsTLS(server.current_client->conn); - } else { - return server.tls_cluster; - } -} - -/* Links to the next and previous entries for keys in the same slot are stored - * in the dict entry metadata. See Slot to Key API below. */ -#define dictEntryNextInSlot(de) \ - (((clusterDictEntryMetadata *)dictEntryMetadata(de))->next) -#define dictEntryPrevInSlot(de) \ - (((clusterDictEntryMetadata *)dictEntryMetadata(de))->prev) - -#define isSlotUnclaimed(slot) \ - (server.cluster->slots[slot] == NULL || \ - bitmapTestBit(server.cluster->owner_not_claiming_slot, slot)) - -#define RCVBUF_INIT_LEN 1024 -#define RCVBUF_MAX_PREALLOC (1<<20) /* 1MB */ - -/* Cluster nodes hash table, mapping nodes addresses 1.2.3.4:6379 to - * clusterNode structures. 
*/ -dictType clusterNodesDictType = { - dictSdsHash, /* hash function */ - NULL, /* key dup */ - NULL, /* val dup */ - dictSdsKeyCompare, /* key compare */ - dictSdsDestructor, /* key destructor */ - NULL, /* val destructor */ - NULL /* allow to expand */ -}; - -/* Cluster re-addition blacklist. This maps node IDs to the time - * we can re-add this node. The goal is to avoid reading a removed - * node for some time. */ -dictType clusterNodesBlackListDictType = { - dictSdsCaseHash, /* hash function */ - NULL, /* key dup */ - NULL, /* val dup */ - dictSdsKeyCaseCompare, /* key compare */ - dictSdsDestructor, /* key destructor */ - NULL, /* val destructor */ - NULL /* allow to expand */ -}; - -static ConnectionType *connTypeOfCluster(void) { - if (server.tls_cluster) { - return connectionTypeTls(); - } - - return connectionTypeTcp(); -} -/* Cluster shards hash table, mapping shard id to list of nodes */ -dictType clusterSdsToListType = { - dictSdsHash, /* hash function */ - NULL, /* key dup */ - NULL, /* val dup */ - dictSdsKeyCompare, /* key compare */ - dictSdsDestructor, /* key destructor */ - dictListDestructor, /* val destructor */ - NULL /* allow to expand */ -}; - -/* Aux fields are introduced in Redis 7.2 to support the persistence - * of various important node properties, such as shard id, in nodes.conf. - * Aux fields take an explicit format of name=value pairs and have no - * intrinsic order among them. Aux fields are always grouped together - * at the end of the second column of each row after the node's IP - * address/port/cluster_port and the optional hostname. Aux fields - * are separated by ','. 
*/ - -/* Aux field setter function prototype - * return C_OK when the update is successful; C_ERR otherwise */ -typedef int (aux_value_setter) (clusterNode* n, void *value, int length); -/* Aux field getter function prototype - * return an sds that is a concatenation of the input sds string and - * the aux value */ -typedef sds (aux_value_getter) (clusterNode* n, sds s); - -typedef int (aux_value_present) (clusterNode* n); - -typedef struct { - char *field; - aux_value_setter *setter; - aux_value_getter *getter; - aux_value_present *isPresent; -} auxFieldHandler; - -/* Assign index to each aux field */ -typedef enum { - af_shard_id, - af_human_nodename, - af_tcp_port, - af_tls_port, - af_count, -} auxFieldIndex; - -/* Note that - * 1. the order of the elements below must match that of their - * indices as defined in auxFieldIndex - * 2. aux name can contain characters that pass the isValidAuxChar check only */ -auxFieldHandler auxFieldHandlers[] = { - {"shard-id", auxShardIdSetter, auxShardIdGetter, auxShardIdPresent}, - {"nodename", auxHumanNodenameSetter, auxHumanNodenameGetter, auxHumanNodenamePresent}, - {"tcp-port", auxTcpPortSetter, auxTcpPortGetter, auxTcpPortPresent}, - {"tls-port", auxTlsPortSetter, auxTlsPortGetter, auxTlsPortPresent}, -}; - -int isValidAuxChar(int c) { - return isalnum(c) || (strchr("!#$%&()*+:;<>?@[]^{|}~", c) == NULL); -} - -int isValidAuxString(char *s, unsigned int length) { - for (unsigned i = 0; i < length; i++) { - if (!isValidAuxChar(s[i])) return 0; - } - return 1; -} - -int auxShardIdSetter(clusterNode *n, void *value, int length) { - if (verifyClusterNodeId(value, length) == C_ERR) { - return C_ERR; - } - memcpy(n->shard_id, value, CLUSTER_NAMELEN); - /* if n already has replicas, make sure they all agree - * on the shard id */ - for (int i = 0; i < n->numslaves; i++) { - if (memcmp(n->slaves[i]->shard_id, n->shard_id, CLUSTER_NAMELEN) != 0) { - return C_ERR; - } - } - clusterAddNodeToShard(value, n); - return C_OK; -} - -sds 
auxShardIdGetter(clusterNode *n, sds s) { - return sdscatprintf(s, "%.40s", n->shard_id); -} - -int auxShardIdPresent(clusterNode *n) { - return strlen(n->shard_id); -} - -int auxHumanNodenameSetter(clusterNode *n, void *value, int length) { - if (n && !strncmp(value, n->human_nodename, length)) { - return C_OK; - } else if (!n && (length == 0)) { - return C_OK; - } - if (n) { - n->human_nodename = sdscpylen(n->human_nodename, value, length); - } else if (sdslen(n->human_nodename) != 0) { - sdsclear(n->human_nodename); - } else { - return C_ERR; - } - return C_OK; -} - -sds auxHumanNodenameGetter(clusterNode *n, sds s) { - return sdscatprintf(s, "%s", n->human_nodename); -} - -int auxHumanNodenamePresent(clusterNode *n) { - return sdslen(n->human_nodename); -} - -int auxTcpPortSetter(clusterNode *n, void *value, int length) { - if (length > 5 || length < 1) { - return C_ERR; - } - char buf[length + 1]; - memcpy(buf, (char*)value, length); - buf[length] = '\0'; - n->tcp_port = atoi(buf); - return (n->tcp_port < 0 || n->tcp_port >= 65536) ? C_ERR : C_OK; -} - -sds auxTcpPortGetter(clusterNode *n, sds s) { - return sdscatprintf(s, "%d", n->tcp_port); -} - -int auxTcpPortPresent(clusterNode *n) { - return n->tcp_port >= 0 && n->tcp_port < 65536; -} - -int auxTlsPortSetter(clusterNode *n, void *value, int length) { - if (length > 5 || length < 1) { - return C_ERR; - } - char buf[length + 1]; - memcpy(buf, (char*)value, length); - buf[length] = '\0'; - n->tls_port = atoi(buf); - return (n->tls_port < 0 || n->tls_port >= 65536) ? 
C_ERR : C_OK; -} - -sds auxTlsPortGetter(clusterNode *n, sds s) { - return sdscatprintf(s, "%d", n->tls_port); -} - -int auxTlsPortPresent(clusterNode *n) { - return n->tls_port >= 0 && n->tls_port < 65536; -} - -/* clusterLink send queue blocks */ -typedef struct { - size_t totlen; /* Total length of this block including the message */ - int refcount; /* Number of cluster link send msg queues containing the message */ - clusterMsg msg; -} clusterMsgSendBlock; - -/* ----------------------------------------------------------------------------- - * Initialization - * -------------------------------------------------------------------------- */ - -/* Load the cluster config from 'filename'. - * - * If the file does not exist or is zero-length (this may happen because - * when we lock the nodes.conf file, we create a zero-length one for the - * sake of locking if it does not already exist), C_ERR is returned. - * If the configuration was loaded from the file, C_OK is returned. */ -int clusterLoadConfig(char *filename) { - FILE *fp = fopen(filename,"r"); - struct stat sb; - char *line; - int maxline, j; - - if (fp == NULL) { - if (errno == ENOENT) { - return C_ERR; - } else { - serverLog(LL_WARNING, - "Loading the cluster node config from %s: %s", - filename, strerror(errno)); - exit(1); - } - } - - if (redis_fstat(fileno(fp),&sb) == -1) { - serverLog(LL_WARNING, - "Unable to obtain the cluster node config file stat %s: %s", - filename, strerror(errno)); - exit(1); - } - /* Check if the file is zero-length: if so return C_ERR to signal - * we have to write the config. */ - if (sb.st_size == 0) { - fclose(fp); - return C_ERR; - } - - /* Parse the file. Note that single lines of the cluster config file can - * be really long as they include all the hash slots of the node. - * This means in the worst possible case, half of the Redis slots will be - * present in a single line, possibly in importing or migrating state, so - * together with the node ID of the sender/receiver. 
- * - * To simplify we allocate 1024+CLUSTER_SLOTS*128 bytes per line. */ - maxline = 1024+CLUSTER_SLOTS*128; - line = zmalloc(maxline); - while(fgets(line,maxline,fp) != NULL) { - int argc, aux_argc; - sds *argv, *aux_argv; - clusterNode *n, *master; - char *p, *s; - - /* Skip blank lines, they can be created either by users manually - * editing nodes.conf or by the config writing process if stopped - * before the truncate() call. */ - if (line[0] == '\n' || line[0] == '\0') continue; - - /* Split the line into arguments for processing. */ - argv = sdssplitargs(line,&argc); - if (argv == NULL) goto fmterr; - - /* Handle the special "vars" line. Don't pretend it is the last - * line even if it actually is when generated by Redis. */ - if (strcasecmp(argv[0],"vars") == 0) { - if (!(argc % 2)) goto fmterr; - for (j = 1; j < argc; j += 2) { - if (strcasecmp(argv[j],"currentEpoch") == 0) { - server.cluster->currentEpoch = - strtoull(argv[j+1],NULL,10); - } else if (strcasecmp(argv[j],"lastVoteEpoch") == 0) { - server.cluster->lastVoteEpoch = - strtoull(argv[j+1],NULL,10); - } else { - serverLog(LL_NOTICE, - "Skipping unknown cluster config variable '%s'", - argv[j]); - } - } - sdsfreesplitres(argv,argc); - continue; - } - - /* Regular config lines have at least eight fields */ - if (argc < 8) { - sdsfreesplitres(argv,argc); - goto fmterr; - } - - /* Create this node if it does not exist */ - if (verifyClusterNodeId(argv[0], sdslen(argv[0])) == C_ERR) { - sdsfreesplitres(argv, argc); - goto fmterr; - } - n = clusterLookupNode(argv[0], sdslen(argv[0])); - if (!n) { - n = createClusterNode(argv[0],0); - clusterAddNode(n); - } - /* Format for the node address and auxiliary argument information: - * ip:port[@cport][,hostname][,aux=val]*] */ - - aux_argv = sdssplitlen(argv[1], sdslen(argv[1]), ",", 1, &aux_argc); - if (aux_argv == NULL) { - sdsfreesplitres(argv,argc); - goto fmterr; - } - - /* Hostname is an optional argument that defines the endpoint - * that can be 
reported to clients instead of IP. */ - if (aux_argc > 1 && sdslen(aux_argv[1]) > 0) { - n->hostname = sdscpy(n->hostname, aux_argv[1]); - } else if (sdslen(n->hostname) != 0) { - sdsclear(n->hostname); - } - - /* All fields after hostname are auxiliary and they take on - * the format of "aux=val" where both aux and val can contain - * characters that pass the isValidAuxChar check only. The order - * of the aux fields is insignificant. */ - int aux_tcp_port = 0; - int aux_tls_port = 0; - for (int i = 2; i < aux_argc; i++) { - int field_argc; - sds *field_argv; - field_argv = sdssplitlen(aux_argv[i], sdslen(aux_argv[i]), "=", 1, &field_argc); - if (field_argv == NULL || field_argc != 2) { - /* Invalid aux field format */ - if (field_argv != NULL) sdsfreesplitres(field_argv, field_argc); - sdsfreesplitres(argv,argc); - goto fmterr; - } - - /* Validate that both aux and value contain valid characters only */ - for (unsigned j = 0; j < 2; j++) { - if (!isValidAuxString(field_argv[j],sdslen(field_argv[j]))){ - /* Invalid aux field format */ - sdsfreesplitres(field_argv, field_argc); - sdsfreesplitres(argv,argc); - goto fmterr; - } - } - - /* Note that we don't expect lots of aux fields in the foreseeable - * future so a linear search is completely fine. 
*/ - int field_found = 0; - for (unsigned j = 0; j < numElements(auxFieldHandlers); j++) { - if (sdslen(field_argv[0]) != strlen(auxFieldHandlers[j].field) || - memcmp(field_argv[0], auxFieldHandlers[j].field, sdslen(field_argv[0])) != 0) { - continue; - } - field_found = 1; - aux_tcp_port |= j == af_tcp_port; - aux_tls_port |= j == af_tls_port; - if (auxFieldHandlers[j].setter(n, field_argv[1], sdslen(field_argv[1])) != C_OK) { - /* Invalid aux field format */ - sdsfreesplitres(field_argv, field_argc); - sdsfreesplitres(argv,argc); - goto fmterr; - } - } - - if (field_found == 0) { - /* Invalid aux field format */ - sdsfreesplitres(field_argv, field_argc); - sdsfreesplitres(argv,argc); - goto fmterr; - } - - sdsfreesplitres(field_argv, field_argc); - } - /* Address and port */ - if ((p = strrchr(aux_argv[0],':')) == NULL) { - sdsfreesplitres(aux_argv, aux_argc); - sdsfreesplitres(argv,argc); - goto fmterr; - } - *p = '\0'; - memcpy(n->ip,aux_argv[0],strlen(aux_argv[0])+1); - char *port = p+1; - char *busp = strchr(port,'@'); - if (busp) { - *busp = '\0'; - busp++; - } - /* If neither TCP or TLS port is found in aux field, it is considered - * an old version of nodes.conf file.*/ - if (!aux_tcp_port && !aux_tls_port) { - if (server.tls_cluster) { - n->tls_port = atoi(port); - } else { - n->tcp_port = atoi(port); - } - } else if (!aux_tcp_port) { - n->tcp_port = atoi(port); - } else if (!aux_tls_port) { - n->tls_port = atoi(port); - } - /* In older versions of nodes.conf the "@busport" part is missing. - * In this case we set it to the default offset of 10000 from the - * base port. */ - n->cport = busp ? atoi(busp) : (getNodeDefaultClientPort(n) + CLUSTER_PORT_INCR); - - /* The plaintext port for client in a TLS cluster (n->pport) is not - * stored in nodes.conf. It is received later over the bus protocol. 
*/ - - sdsfreesplitres(aux_argv, aux_argc); - - /* Parse flags */ - p = s = argv[2]; - while(p) { - p = strchr(s,','); - if (p) *p = '\0'; - if (!strcasecmp(s,"myself")) { - serverAssert(server.cluster->myself == NULL); - myself = server.cluster->myself = n; - n->flags |= CLUSTER_NODE_MYSELF; - } else if (!strcasecmp(s,"master")) { - n->flags |= CLUSTER_NODE_MASTER; - } else if (!strcasecmp(s,"slave")) { - n->flags |= CLUSTER_NODE_SLAVE; - } else if (!strcasecmp(s,"fail?")) { - n->flags |= CLUSTER_NODE_PFAIL; - } else if (!strcasecmp(s,"fail")) { - n->flags |= CLUSTER_NODE_FAIL; - n->fail_time = mstime(); - } else if (!strcasecmp(s,"handshake")) { - n->flags |= CLUSTER_NODE_HANDSHAKE; - } else if (!strcasecmp(s,"noaddr")) { - n->flags |= CLUSTER_NODE_NOADDR; - } else if (!strcasecmp(s,"nofailover")) { - n->flags |= CLUSTER_NODE_NOFAILOVER; - } else if (!strcasecmp(s,"noflags")) { - /* nothing to do */ - } else { - serverPanic("Unknown flag in redis cluster config file"); - } - if (p) s = p+1; - } - - /* Get master if any. Set the master and populate master's - * slave list. */ - if (argv[3][0] != '-') { - if (verifyClusterNodeId(argv[3], sdslen(argv[3])) == C_ERR) { - sdsfreesplitres(argv, argc); - goto fmterr; - } - master = clusterLookupNode(argv[3], sdslen(argv[3])); - if (!master) { - master = createClusterNode(argv[3],0); - clusterAddNode(master); - } - /* shard_id can be absent if we are loading a nodes.conf generated - * by an older version of Redis; we should follow the primary's - * shard_id in this case */ - if (auxFieldHandlers[af_shard_id].isPresent(n) == 0) { - memcpy(n->shard_id, master->shard_id, CLUSTER_NAMELEN); - clusterAddNodeToShard(master->shard_id, n); - } else if (clusterGetNodesInMyShard(master) != NULL && - memcmp(master->shard_id, n->shard_id, CLUSTER_NAMELEN) != 0) - { - /* If the primary has been added to a shard, make sure this - * node has the same persisted shard id as the primary. 
*/ - goto fmterr; - } - n->slaveof = master; - clusterNodeAddSlave(master,n); - } else if (auxFieldHandlers[af_shard_id].isPresent(n) == 0) { - /* n is a primary but it does not have a persisted shard_id. - * This happens if we are loading a nodes.conf generated by - * an older version of Redis. We should manually update the - * shard membership in this case */ - clusterAddNodeToShard(n->shard_id, n); - } - - /* Set ping sent / pong received timestamps */ - if (atoi(argv[4])) n->ping_sent = mstime(); - if (atoi(argv[5])) n->pong_received = mstime(); - - /* Set configEpoch for this node. - * If the node is a replica, set its config epoch to 0. - * If it's a primary, load the config epoch from the configuration file. */ - n->configEpoch = (nodeIsSlave(n) && n->slaveof) ? 0 : strtoull(argv[6],NULL,10); - - /* Populate hash slots served by this instance. */ - for (j = 8; j < argc; j++) { - int start, stop; - - if (argv[j][0] == '[') { - /* Here we handle migrating / importing slots */ - int slot; - char direction; - clusterNode *cn; - - p = strchr(argv[j],'-'); - serverAssert(p != NULL); - *p = '\0'; - direction = p[1]; /* Either '>' or '<' */ - slot = atoi(argv[j]+1); - if (slot < 0 || slot >= CLUSTER_SLOTS) { - sdsfreesplitres(argv,argc); - goto fmterr; - } - p += 3; - - char *pr = strchr(p, ']'); - size_t node_len = pr - p; - if (pr == NULL || verifyClusterNodeId(p, node_len) == C_ERR) { - sdsfreesplitres(argv, argc); - goto fmterr; - } - cn = clusterLookupNode(p, CLUSTER_NAMELEN); - if (!cn) { - cn = createClusterNode(p,0); - clusterAddNode(cn); - } - if (direction == '>') { - server.cluster->migrating_slots_to[slot] = cn; - } else { - server.cluster->importing_slots_from[slot] = cn; - } - continue; - } else if ((p = strchr(argv[j],'-')) != NULL) { - *p = '\0'; - start = atoi(argv[j]); - stop = atoi(p+1); - } else { - start = stop = atoi(argv[j]); - } - if (start < 0 || start >= CLUSTER_SLOTS || - stop < 0 || stop >= CLUSTER_SLOTS) - { - sdsfreesplitres(argv,argc); 
- goto fmterr; - } - while(start <= stop) clusterAddSlot(n, start++); - } - - sdsfreesplitres(argv,argc); - } - /* Config sanity check */ - if (server.cluster->myself == NULL) goto fmterr; - - zfree(line); - fclose(fp); - - serverLog(LL_NOTICE,"Node configuration loaded, I'm %.40s", myself->name); - - /* Something that should never happen: currentEpoch smaller than - * the max epoch found in the nodes configuration. However we handle this - * as some form of protection against manual editing of critical files. */ - if (clusterGetMaxEpoch() > server.cluster->currentEpoch) { - server.cluster->currentEpoch = clusterGetMaxEpoch(); - } - return C_OK; - -fmterr: - serverLog(LL_WARNING, - "Unrecoverable error: corrupted cluster config file \"%s\".", line); - zfree(line); - if (fp) fclose(fp); - exit(1); -} - -/* Cluster node configuration is exactly the same as CLUSTER NODES output. - * - * This function writes the node config and returns 0, on error -1 - * is returned. - * - * Note: we need to write the file in an atomic way from the point of view - * of the POSIX filesystem semantics, so that if the server is stopped - * or crashes during the write, we'll end with either the old file or the - * new one. Since we have the full payload to write available we can use - * a single write to write the whole file. If the pre-existing file was - * bigger we pad our payload with newlines that are anyway ignored and truncate - * the file afterward. */ -int clusterSaveConfig(int do_fsync) { - sds ci,tmpfilename; - size_t content_size,offset = 0; - ssize_t written_bytes; - int fd = -1; - int retval = C_ERR; - - server.cluster->todo_before_sleep &= ~CLUSTER_TODO_SAVE_CONFIG; - - /* Get the nodes description and concatenate our "vars" directive to - * save currentEpoch and lastVoteEpoch. 
*/ - ci = clusterGenNodesDescription(NULL, CLUSTER_NODE_HANDSHAKE, 0); - ci = sdscatprintf(ci,"vars currentEpoch %llu lastVoteEpoch %llu\n", - (unsigned long long) server.cluster->currentEpoch, - (unsigned long long) server.cluster->lastVoteEpoch); - content_size = sdslen(ci); - - /* Create a temp file with the new content. */ - tmpfilename = sdscatfmt(sdsempty(),"%s.tmp-%i-%I", - server.cluster_configfile,(int) getpid(),mstime()); - if ((fd = open(tmpfilename,O_WRONLY|O_CREAT,0644)) == -1) { - serverLog(LL_WARNING,"Could not open temp cluster config file: %s",strerror(errno)); - goto cleanup; - } - - while (offset < content_size) { - written_bytes = write(fd,ci + offset,content_size - offset); - if (written_bytes <= 0) { - if (errno == EINTR) continue; - serverLog(LL_WARNING,"Failed after writing (%zd) bytes to tmp cluster config file: %s", - offset,strerror(errno)); - goto cleanup; - } - offset += written_bytes; - } - - if (do_fsync) { - server.cluster->todo_before_sleep &= ~CLUSTER_TODO_FSYNC_CONFIG; - if (redis_fsync(fd) == -1) { - serverLog(LL_WARNING,"Could not sync tmp cluster config file: %s",strerror(errno)); - goto cleanup; - } - } - - if (rename(tmpfilename, server.cluster_configfile) == -1) { - serverLog(LL_WARNING,"Could not rename tmp cluster config file: %s",strerror(errno)); - goto cleanup; - } - - if (do_fsync) { - if (fsyncFileDir(server.cluster_configfile) == -1) { - serverLog(LL_WARNING,"Could not sync cluster config file dir: %s",strerror(errno)); - goto cleanup; - } - } - retval = C_OK; /* If we reached this point, everything is fine. 
*/ - -cleanup: - if (fd != -1) close(fd); - if (retval) unlink(tmpfilename); - sdsfree(tmpfilename); - sdsfree(ci); - return retval; -} - -void clusterSaveConfigOrDie(int do_fsync) { - if (clusterSaveConfig(do_fsync) == -1) { - serverLog(LL_WARNING,"Fatal: can't update cluster config file."); - exit(1); - } -} - -/* Lock the cluster config using flock(), and retain the file descriptor used to - * acquire the lock so that the file will be locked as long as the process is up. - * - * This works because we always update nodes.conf with a new version - * in-place, reopening the file, and writing to it in place (later adjusting - * the length with ftruncate()). - * - * On success C_OK is returned, otherwise an error is logged and - * the function returns C_ERR to signal a lock was not acquired. */ -int clusterLockConfig(char *filename) { -/* flock() does not exist on Solaris - * and a fcntl-based solution won't help, as we constantly re-open that file, - * which will release _all_ locks anyway - */ -#if !defined(__sun) - /* To lock it, we need to open the file in a way it is created if - * it does not exist, otherwise there is a race condition with other - * processes. */ - int fd = open(filename,O_WRONLY|O_CREAT|O_CLOEXEC,0644); - if (fd == -1) { - serverLog(LL_WARNING, - "Can't open %s in order to acquire a lock: %s", - filename, strerror(errno)); - return C_ERR; - } - - if (flock(fd,LOCK_EX|LOCK_NB) == -1) { - if (errno == EWOULDBLOCK) { - serverLog(LL_WARNING, - "Sorry, the cluster configuration file %s is already used " - "by a different Redis Cluster node. Please make sure that " - "different nodes use different cluster configuration " - "files.", filename); - } else { - serverLog(LL_WARNING, - "Impossible to lock %s: %s", filename, strerror(errno)); - } - close(fd); - return C_ERR; - } - /* Lock acquired: leak the 'fd' by not closing it until shutdown time, so that - * we'll retain the lock to the file as long as the process exists. 
- * - * After fork, the child process will get the fd opened by the parent process, - * we need save `fd` to `cluster_config_file_lock_fd`, so that in redisFork(), - * it will be closed in the child process. - * If it is not closed, when the main process is killed -9, but the child process - * (redis-aof-rewrite) is still alive, the fd(lock) will still be held by the - * child process, and the main process will fail to get lock, means fail to start. */ - server.cluster_config_file_lock_fd = fd; -#else - UNUSED(filename); -#endif /* __sun */ - - return C_OK; -} - -/* Derives our ports to be announced in the cluster bus. */ -void deriveAnnouncedPorts(int *announced_tcp_port, int *announced_tls_port, - int *announced_cport) { - /* Config overriding announced ports. */ - *announced_tcp_port = server.cluster_announce_port ? - server.cluster_announce_port : server.port; - *announced_tls_port = server.cluster_announce_tls_port ? - server.cluster_announce_tls_port : server.tls_port; - /* Derive cluster bus port. */ - if (server.cluster_announce_bus_port) { - *announced_cport = server.cluster_announce_bus_port; - } else if (server.cluster_port) { - *announced_cport = server.cluster_port; - } else { - *announced_cport = defaultClientPort() + CLUSTER_PORT_INCR; - } -} - -/* Some flags (currently just the NOFAILOVER flag) may need to be updated - * in the "myself" node based on the current configuration of the node, - * that may change at runtime via CONFIG SET. This function changes the - * set of flags in myself->flags accordingly. */ -void clusterUpdateMyselfFlags(void) { - if (!myself) return; - int oldflags = myself->flags; - int nofailover = server.cluster_slave_no_failover ? 
- CLUSTER_NODE_NOFAILOVER : 0; - myself->flags &= ~CLUSTER_NODE_NOFAILOVER; - myself->flags |= nofailover; - if (myself->flags != oldflags) { - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE); - } -} - - -/* We want to take myself->port/cport/pport in sync with the -* cluster-announce-port/cluster-announce-bus-port/cluster-announce-tls-port option. -* The option can be set at runtime via CONFIG SET. */ -void clusterUpdateMyselfAnnouncedPorts(void) { - if (!myself) return; - deriveAnnouncedPorts(&myself->tcp_port,&myself->tls_port,&myself->cport); -} - -/* We want to take myself->ip in sync with the cluster-announce-ip option. -* The option can be set at runtime via CONFIG SET. */ -void clusterUpdateMyselfIp(void) { - if (!myself) return; - static char *prev_ip = NULL; - char *curr_ip = server.cluster_announce_ip; - int changed = 0; - - if (prev_ip == NULL && curr_ip != NULL) changed = 1; - else if (prev_ip != NULL && curr_ip == NULL) changed = 1; - else if (prev_ip && curr_ip && strcmp(prev_ip,curr_ip)) changed = 1; - - if (changed) { - if (prev_ip) zfree(prev_ip); - prev_ip = curr_ip; - - if (curr_ip) { - /* We always take a copy of the previous IP address, by - * duplicating the string. This way later we can check if - * the address really changed. */ - prev_ip = zstrdup(prev_ip); - redis_strlcpy(myself->ip,server.cluster_announce_ip,NET_IP_STR_LEN); - } else { - myself->ip[0] = '\0'; /* Force autodetection. */ - } - } -} - -/* Update the hostname for the specified node with the provided C string. */ -static void updateAnnouncedHostname(clusterNode *node, char *new) { - /* Previous and new hostname are the same, no need to update. 
*/ - if (new && !strcmp(new, node->hostname)) { - return; - } else if (!new && (sdslen(node->hostname) == 0)) { - return; - } - - if (new) { - node->hostname = sdscpy(node->hostname, new); - } else if (sdslen(node->hostname) != 0) { - sdsclear(node->hostname); - } - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); -} - -static void updateAnnouncedHumanNodename(clusterNode *node, char *new) { - if (new && !strcmp(new, node->human_nodename)) { - return; - } else if (!new && (sdslen(node->human_nodename) == 0)) { - return; - } - - if (new) { - node->human_nodename = sdscpy(node->human_nodename, new); - } else if (sdslen(node->human_nodename) != 0) { - sdsclear(node->human_nodename); - } - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); -} - - -static void updateShardId(clusterNode *node, const char *shard_id) { - if (shard_id && memcmp(node->shard_id, shard_id, CLUSTER_NAMELEN) != 0) { - clusterRemoveNodeFromShard(node); - memcpy(node->shard_id, shard_id, CLUSTER_NAMELEN); - clusterAddNodeToShard(shard_id, node); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); - } - if (shard_id && myself != node && myself->slaveof == node) { - if (memcmp(myself->shard_id, shard_id, CLUSTER_NAMELEN) != 0) { - /* shard-id can diverge right after a rolling upgrade - * from pre-7.2 releases */ - clusterRemoveNodeFromShard(myself); - memcpy(myself->shard_id, shard_id, CLUSTER_NAMELEN); - clusterAddNodeToShard(shard_id, myself); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_FSYNC_CONFIG); - } - } -} - -/* Update my hostname based on server configuration values */ -void clusterUpdateMyselfHostname(void) { - if (!myself) return; - updateAnnouncedHostname(myself, server.cluster_announce_hostname); -} - -void clusterUpdateMyselfHumanNodename(void) { - if (!myself) return; - updateAnnouncedHumanNodename(myself, server.cluster_announce_human_nodename); -} - -void clusterInit(void) { - int saveconf = 0; - - server.cluster = zmalloc(sizeof(clusterState)); - server.cluster->myself = 
NULL; - server.cluster->currentEpoch = 0; - server.cluster->state = CLUSTER_FAIL; - server.cluster->size = 1; - server.cluster->todo_before_sleep = 0; - server.cluster->nodes = dictCreate(&clusterNodesDictType); - server.cluster->shards = dictCreate(&clusterSdsToListType); - server.cluster->nodes_black_list = - dictCreate(&clusterNodesBlackListDictType); - server.cluster->failover_auth_time = 0; - server.cluster->failover_auth_count = 0; - server.cluster->failover_auth_rank = 0; - server.cluster->failover_auth_epoch = 0; - server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE; - server.cluster->lastVoteEpoch = 0; - - /* Initialize stats */ - for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) { - server.cluster->stats_bus_messages_sent[i] = 0; - server.cluster->stats_bus_messages_received[i] = 0; - } - server.cluster->stats_pfail_nodes = 0; - server.cluster->stat_cluster_links_buffer_limit_exceeded = 0; - - memset(server.cluster->slots,0, sizeof(server.cluster->slots)); - clusterCloseAllSlots(); - - memset(server.cluster->owner_not_claiming_slot, 0, sizeof(server.cluster->owner_not_claiming_slot)); - - /* Lock the cluster config file to make sure every node uses - * its own nodes.conf. */ - server.cluster_config_file_lock_fd = -1; - if (clusterLockConfig(server.cluster_configfile) == C_ERR) - exit(1); - - /* Load or create a new nodes configuration. */ - if (clusterLoadConfig(server.cluster_configfile) == C_ERR) { - /* No configuration found. We will just use the random name provided - * by the createClusterNode() function. 
*/ - myself = server.cluster->myself = - createClusterNode(NULL,CLUSTER_NODE_MYSELF|CLUSTER_NODE_MASTER); - serverLog(LL_NOTICE,"No cluster configuration found, I'm %.40s", - myself->name); - clusterAddNode(myself); - clusterAddNodeToShard(myself->shard_id, myself); - saveconf = 1; - } - if (saveconf) clusterSaveConfigOrDie(1); - - /* Port sanity check II - * The other handshake port check is triggered too late to stop - * us from trying to use a too-high cluster port number. */ - int port = defaultClientPort(); - if (!server.cluster_port && port > (65535-CLUSTER_PORT_INCR)) { - serverLog(LL_WARNING, "Redis port number too high. " - "Cluster communication port is 10,000 port " - "numbers higher than your Redis port. " - "Your Redis port number must be 55535 or less."); - exit(1); - } - if (!server.bindaddr_count) { - serverLog(LL_WARNING, "No bind address is configured, but it is required for the Cluster bus."); - exit(1); - } - - /* The slots -> channels map is a radix tree. Initialize it here. */ - server.cluster->slots_to_channels = raxNew(); - - /* Set myself->port/cport/pport to my listening ports, we'll just need to - * discover the IP address via MEET messages. */ - deriveAnnouncedPorts(&myself->tcp_port, &myself->tls_port, &myself->cport); - - server.cluster->mf_end = 0; - server.cluster->mf_slave = NULL; - resetManualFailover(); - clusterUpdateMyselfFlags(); - clusterUpdateMyselfIp(); - clusterUpdateMyselfHostname(); - clusterUpdateMyselfHumanNodename(); -} - -void clusterInitListeners(void) { - if (connectionIndexByType(connTypeOfCluster()->get_type(NULL)) < 0) { - serverLog(LL_WARNING, "Missing connection type %s, but it is required for the Cluster bus.", connTypeOfCluster()->get_type(NULL)); - exit(1); - } - - int port = defaultClientPort(); - connListener *listener = &server.clistener; - listener->count = 0; - listener->bindaddr = server.bindaddr; - listener->bindaddr_count = server.bindaddr_count; - listener->port = server.cluster_port ? 
server.cluster_port : port + CLUSTER_PORT_INCR; - listener->ct = connTypeOfCluster(); - if (connListen(listener) == C_ERR ) { - /* Note: the following log text is matched by the test suite. */ - serverLog(LL_WARNING, "Failed listening on port %u (cluster), aborting.", listener->port); - exit(1); - } - - if (createSocketAcceptHandler(&server.clistener, clusterAcceptHandler) != C_OK) { - serverPanic("Unrecoverable error creating Redis Cluster socket accept handler."); - } -} - -/* Reset a node performing a soft or hard reset: - * - * 1) All other nodes are forgotten. - * 2) All the assigned / open slots are released. - * 3) If the node is a slave, it turns into a master. - * 4) Only for hard reset: a new Node ID is generated. - * 5) Only for hard reset: currentEpoch and configEpoch are set to 0. - * 6) The new configuration is saved and the cluster state updated. - * 7) If the node was a slave, the whole data set is flushed away. */ -void clusterReset(int hard) { - dictIterator *di; - dictEntry *de; - int j; - - /* Turn into master. */ - if (nodeIsSlave(myself)) { - clusterSetNodeAsMaster(myself); - replicationUnsetMaster(); - emptyData(-1,EMPTYDB_NO_FLAGS,NULL); - } - - /* Close slots, reset manual failover state. */ - clusterCloseAllSlots(); - resetManualFailover(); - - /* Unassign all the slots. */ - for (j = 0; j < CLUSTER_SLOTS; j++) clusterDelSlot(j); - - /* Recreate shards dict */ - dictEmpty(server.cluster->shards, NULL); - - /* Forget all the nodes, but myself. */ - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - - if (node == myself) continue; - clusterDelNode(node); - } - dictReleaseIterator(di); - - /* Empty the nodes blacklist. */ - dictEmpty(server.cluster->nodes_black_list, NULL); - - /* Hard reset only: set epochs to 0, change node ID. 
*/ - if (hard) { - sds oldname; - - server.cluster->currentEpoch = 0; - server.cluster->lastVoteEpoch = 0; - myself->configEpoch = 0; - serverLog(LL_NOTICE, "configEpoch set to 0 via CLUSTER RESET HARD"); - - /* To change the Node ID we need to remove the old name from the - * nodes table, change the ID, and re-add back with new name. */ - oldname = sdsnewlen(myself->name, CLUSTER_NAMELEN); - dictDelete(server.cluster->nodes,oldname); - sdsfree(oldname); - getRandomHexChars(myself->name, CLUSTER_NAMELEN); - getRandomHexChars(myself->shard_id, CLUSTER_NAMELEN); - clusterAddNode(myself); - serverLog(LL_NOTICE,"Node hard reset, now I'm %.40s", myself->name); - } - - /* Re-populate shards */ - clusterAddNodeToShard(myself->shard_id, myself); - - /* Make sure to persist the new config and update the state. */ - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_FSYNC_CONFIG); -} - -/* ----------------------------------------------------------------------------- - * CLUSTER communication link - * -------------------------------------------------------------------------- */ -static clusterMsgSendBlock *createClusterMsgSendBlock(int type, uint32_t msglen) { - uint32_t blocklen = msglen + sizeof(clusterMsgSendBlock) - sizeof(clusterMsg); - clusterMsgSendBlock *msgblock = zcalloc(blocklen); - msgblock->refcount = 1; - msgblock->totlen = blocklen; - server.stat_cluster_links_memory += blocklen; - clusterBuildMessageHdr(&msgblock->msg,type,msglen); - return msgblock; -} - -static void clusterMsgSendBlockDecrRefCount(void *node) { - clusterMsgSendBlock *msgblock = (clusterMsgSendBlock*)node; - msgblock->refcount--; - serverAssert(msgblock->refcount >= 0); - if (msgblock->refcount == 0) { - server.stat_cluster_links_memory -= msgblock->totlen; - zfree(msgblock); - } -} - -clusterLink *createClusterLink(clusterNode *node) { - clusterLink *link = zmalloc(sizeof(*link)); - link->ctime = mstime(); - link->send_msg_queue = listCreate(); - 
listSetFreeMethod(link->send_msg_queue, clusterMsgSendBlockDecrRefCount); - link->head_msg_send_offset = 0; - link->send_msg_queue_mem = sizeof(list); - link->rcvbuf = zmalloc(link->rcvbuf_alloc = RCVBUF_INIT_LEN); - link->rcvbuf_len = 0; - server.stat_cluster_links_memory += link->rcvbuf_alloc + link->send_msg_queue_mem; - link->conn = NULL; - link->node = node; - /* Related node can only possibly be known at link creation time if this is an outbound link */ - link->inbound = (node == NULL); - if (!link->inbound) { - node->link = link; - } - return link; -} - -/* Free a cluster link, but does not free the associated node of course. - * This function will just make sure that the original node associated - * with this link will have the 'link' field set to NULL. */ -void freeClusterLink(clusterLink *link) { - if (link->conn) { - connClose(link->conn); - link->conn = NULL; - } - server.stat_cluster_links_memory -= sizeof(list) + listLength(link->send_msg_queue)*sizeof(listNode); - listRelease(link->send_msg_queue); - server.stat_cluster_links_memory -= link->rcvbuf_alloc; - zfree(link->rcvbuf); - if (link->node) { - if (link->node->link == link) { - serverAssert(!link->inbound); - link->node->link = NULL; - } else if (link->node->inbound_link == link) { - serverAssert(link->inbound); - link->node->inbound_link = NULL; - } - } - zfree(link); -} - -void setClusterNodeToInboundClusterLink(clusterNode *node, clusterLink *link) { - serverAssert(!link->node); - serverAssert(link->inbound); - if (node->inbound_link) { - /* A peer may disconnect and then reconnect with us, and it's not guaranteed that - * we would always process the disconnection of the existing inbound link before - * accepting a new existing inbound link. Therefore, it's possible to have more than - * one inbound link from the same node at the same time. Our cleanup logic assumes - * a one to one relationship between nodes and inbound links, so we need to kill - * one of the links. 
The existing link is more likely the outdated one, but it's - * possible the other node may need to open another link. */ - serverLog(LL_DEBUG, "Replacing inbound link fd %d from node %.40s with fd %d", - node->inbound_link->conn->fd, node->name, link->conn->fd); - freeClusterLink(node->inbound_link); - } - serverAssert(!node->inbound_link); - node->inbound_link = link; - link->node = node; -} - -static void clusterConnAcceptHandler(connection *conn) { - clusterLink *link; - - if (connGetState(conn) != CONN_STATE_CONNECTED) { - serverLog(LL_VERBOSE, - "Error accepting cluster node connection: %s", connGetLastError(conn)); - connClose(conn); - return; - } - - /* Create a link object we use to handle the connection. - * It gets passed to the readable handler when data is available. - * Initially the link->node pointer is set to NULL as we don't know - * which node is, but the right node is references once we know the - * node identity. */ - link = createClusterLink(NULL); - link->conn = conn; - connSetPrivateData(conn, link); - - /* Register read handler */ - connSetReadHandler(conn, clusterReadHandler); -} - -#define MAX_CLUSTER_ACCEPTS_PER_CALL 1000 -void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { - int cport, cfd; - int max = MAX_CLUSTER_ACCEPTS_PER_CALL; - char cip[NET_IP_STR_LEN]; - int require_auth = TLS_CLIENT_AUTH_YES; - UNUSED(el); - UNUSED(mask); - UNUSED(privdata); - - /* If the server is starting up, don't accept cluster connections: - * UPDATE messages may interact with the database content. 
*/ - if (server.masterhost == NULL && server.loading) return; - - while(max--) { - cfd = anetTcpAccept(server.neterr, fd, cip, sizeof(cip), &cport); - if (cfd == ANET_ERR) { - if (errno != EWOULDBLOCK) - serverLog(LL_VERBOSE, - "Error accepting cluster node: %s", server.neterr); - return; - } - - connection *conn = connCreateAccepted(connTypeOfCluster(), cfd, &require_auth); - - /* Make sure connection is not in an error state */ - if (connGetState(conn) != CONN_STATE_ACCEPTING) { - serverLog(LL_VERBOSE, - "Error creating an accepting connection for cluster node: %s", - connGetLastError(conn)); - connClose(conn); - return; - } - connEnableTcpNoDelay(conn); - connKeepAlive(conn,server.cluster_node_timeout / 1000 * 2); - - /* Use non-blocking I/O for cluster messages. */ - serverLog(LL_VERBOSE,"Accepting cluster node connection from %s:%d", cip, cport); - - /* Accept the connection now. connAccept() may call our handler directly - * or schedule it for later depending on connection implementation. - */ - if (connAccept(conn, clusterConnAcceptHandler) == C_ERR) { - if (connGetState(conn) == CONN_STATE_ERROR) - serverLog(LL_VERBOSE, - "Error accepting cluster node connection: %s", - connGetLastError(conn)); - connClose(conn); - return; - } - } -} - -/* Return the approximated number of sockets we are using in order to - * take the cluster bus connections. */ -unsigned long getClusterConnectionsCount(void) { - /* We decrement the number of nodes by one, since there is the - * "myself" node too in the list. Each node uses two file descriptors, - * one incoming and one outgoing, thus the multiplication by 2. */ - return server.cluster_enabled ? - ((dictSize(server.cluster->nodes)-1)*2) : 0; -} - /* ----------------------------------------------------------------------------- * Key space handling * -------------------------------------------------------------------------- */ @@ -1343,7 +10,7 @@ unsigned long getClusterConnectionsCount(void) { /* We have 16384 hash slots. 
The hash slot of a given key is obtained * as the least significant 14 bits of the crc16 of the key. * - * However if the key contains the {...} pattern, only the part between + * However, if the key contains the {...} pattern, only the part between * { and } is hashed. This may be useful in the future to force certain * keys to be in the same node (assuming no resharding is in progress). */ unsigned int keyHashSlot(char *key, int keylen) { @@ -1397,5141 +64,12 @@ int patternHashSlot(char *pattern, int length) { return crc16(pattern, length) & 0x3FFF; } -/* ----------------------------------------------------------------------------- - * CLUSTER node API - * -------------------------------------------------------------------------- */ - -/* Create a new cluster node, with the specified flags. - * If "nodename" is NULL this is considered a first handshake and a random - * node name is assigned to this node (it will be fixed later when we'll - * receive the first pong). - * - * The node is created and returned to the user, but it is not automatically - * added to the nodes hash table. 
*/ -clusterNode *createClusterNode(char *nodename, int flags) { - clusterNode *node = zmalloc(sizeof(*node)); - - if (nodename) - memcpy(node->name, nodename, CLUSTER_NAMELEN); - else - getRandomHexChars(node->name, CLUSTER_NAMELEN); - getRandomHexChars(node->shard_id, CLUSTER_NAMELEN); - node->ctime = mstime(); - node->configEpoch = 0; - node->flags = flags; - memset(node->slots,0,sizeof(node->slots)); - node->slot_info_pairs = NULL; - node->slot_info_pairs_count = 0; - node->numslots = 0; - node->numslaves = 0; - node->slaves = NULL; - node->slaveof = NULL; - node->last_in_ping_gossip = 0; - node->ping_sent = node->pong_received = 0; - node->data_received = 0; - node->fail_time = 0; - node->link = NULL; - node->inbound_link = NULL; - memset(node->ip,0,sizeof(node->ip)); - node->hostname = sdsempty(); - node->human_nodename = sdsempty(); - node->tcp_port = 0; - node->cport = 0; - node->tls_port = 0; - node->fail_reports = listCreate(); - node->voted_time = 0; - node->orphaned_time = 0; - node->repl_offset_time = 0; - node->repl_offset = 0; - listSetFreeMethod(node->fail_reports,zfree); - return node; -} - -/* This function is called every time we get a failure report from a node. - * The side effect is to populate the fail_reports list (or to update - * the timestamp of an existing report). - * - * 'failing' is the node that is in failure state according to the - * 'sender' node. - * - * The function returns 0 if it just updates a timestamp of an existing - * failure report from the same sender. 1 is returned if a new failure - * report is created. */ -int clusterNodeAddFailureReport(clusterNode *failing, clusterNode *sender) { - list *l = failing->fail_reports; - listNode *ln; - listIter li; - clusterNodeFailReport *fr; - - /* If a failure report from the same sender already exists, just update - * the timestamp. 
*/ - listRewind(l,&li); - while ((ln = listNext(&li)) != NULL) { - fr = ln->value; - if (fr->node == sender) { - fr->time = mstime(); - return 0; - } - } - - /* Otherwise create a new report. */ - fr = zmalloc(sizeof(*fr)); - fr->node = sender; - fr->time = mstime(); - listAddNodeTail(l,fr); - return 1; -} - -/* Remove failure reports that are too old, where too old means reasonably - * older than the global node timeout. Note that anyway for a node to be - * flagged as FAIL we need to have a local PFAIL state that is at least - * older than the global node timeout, so we don't just trust the number - * of failure reports from other nodes. */ -void clusterNodeCleanupFailureReports(clusterNode *node) { - list *l = node->fail_reports; - listNode *ln; - listIter li; - clusterNodeFailReport *fr; - mstime_t maxtime = server.cluster_node_timeout * - CLUSTER_FAIL_REPORT_VALIDITY_MULT; - mstime_t now = mstime(); - - listRewind(l,&li); - while ((ln = listNext(&li)) != NULL) { - fr = ln->value; - if (now - fr->time > maxtime) listDelNode(l,ln); - } -} - -/* Remove the failing report for 'node' if it was previously considered - * failing by 'sender'. This function is called when a node informs us via - * gossip that a node is OK from its point of view (no FAIL or PFAIL flags). - * - * Note that this function is called relatively often as it gets called even - * when there are no nodes failing, and is O(N), however when the cluster is - * fine the failure reports list is empty so the function runs in constant - * time. - * - * The function returns 1 if the failure report was found and removed. - * Otherwise 0 is returned. */ -int clusterNodeDelFailureReport(clusterNode *node, clusterNode *sender) { - list *l = node->fail_reports; - listNode *ln; - listIter li; - clusterNodeFailReport *fr; - - /* Search for a failure report from this sender. 
*/ - listRewind(l,&li); - while ((ln = listNext(&li)) != NULL) { - fr = ln->value; - if (fr->node == sender) break; - } - if (!ln) return 0; /* No failure report from this sender. */ - - /* Remove the failure report. */ - listDelNode(l,ln); - clusterNodeCleanupFailureReports(node); - return 1; -} - -/* Return the number of external nodes that believe 'node' is failing, - * not including this node, that may have a PFAIL or FAIL state for this - * node as well. */ -int clusterNodeFailureReportsCount(clusterNode *node) { - clusterNodeCleanupFailureReports(node); - return listLength(node->fail_reports); -} - -int clusterNodeRemoveSlave(clusterNode *master, clusterNode *slave) { - int j; - - for (j = 0; j < master->numslaves; j++) { - if (master->slaves[j] == slave) { - if ((j+1) < master->numslaves) { - int remaining_slaves = (master->numslaves - j) - 1; - memmove(master->slaves+j,master->slaves+(j+1), - (sizeof(*master->slaves) * remaining_slaves)); - } - master->numslaves--; - if (master->numslaves == 0) - master->flags &= ~CLUSTER_NODE_MIGRATE_TO; - return C_OK; - } - } - return C_ERR; -} - -int clusterNodeAddSlave(clusterNode *master, clusterNode *slave) { - int j; - - /* If it's already a slave, don't add it again. */ - for (j = 0; j < master->numslaves; j++) - if (master->slaves[j] == slave) return C_ERR; - master->slaves = zrealloc(master->slaves, - sizeof(clusterNode*)*(master->numslaves+1)); - master->slaves[master->numslaves] = slave; - master->numslaves++; - master->flags |= CLUSTER_NODE_MIGRATE_TO; - return C_OK; -} - -int clusterCountNonFailingSlaves(clusterNode *n) { - int j, okslaves = 0; - - for (j = 0; j < n->numslaves; j++) - if (!nodeFailed(n->slaves[j])) okslaves++; - return okslaves; -} - -/* Low level cleanup of the node structure. Only called by clusterDelNode(). */ -void freeClusterNode(clusterNode *n) { - sds nodename; - int j; - - /* If the node has associated slaves, we have to set - * all the slaves->slaveof fields to NULL (unknown). 
*/ - for (j = 0; j < n->numslaves; j++) - n->slaves[j]->slaveof = NULL; - - /* Remove this node from the list of slaves of its master. */ - if (nodeIsSlave(n) && n->slaveof) clusterNodeRemoveSlave(n->slaveof,n); - - /* Unlink from the set of nodes. */ - nodename = sdsnewlen(n->name, CLUSTER_NAMELEN); - serverAssert(dictDelete(server.cluster->nodes,nodename) == DICT_OK); - sdsfree(nodename); - sdsfree(n->hostname); - sdsfree(n->human_nodename); - - /* Release links and associated data structures. */ - if (n->link) freeClusterLink(n->link); - if (n->inbound_link) freeClusterLink(n->inbound_link); - listRelease(n->fail_reports); - zfree(n->slaves); - zfree(n); -} - -/* Add a node to the nodes hash table */ -void clusterAddNode(clusterNode *node) { - int retval; - - retval = dictAdd(server.cluster->nodes, - sdsnewlen(node->name,CLUSTER_NAMELEN), node); - serverAssert(retval == DICT_OK); -} - -/* Remove a node from the cluster. The function performs the high level - * cleanup, calling freeClusterNode() for the low level cleanup. - * Here we do the following: - * - * 1) Mark all the slots handled by it as unassigned. - * 2) Remove all the failure reports sent by this node and referenced by - * other nodes. - * 3) Remove the node from the owning shard - * 4) Free the node with freeClusterNode() that will in turn remove it - * from the hash table and from the list of slaves of its master, if - * it is a slave node. - */ -void clusterDelNode(clusterNode *delnode) { - int j; - dictIterator *di; - dictEntry *de; - - /* 1) Mark slots as unassigned. */ - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (server.cluster->importing_slots_from[j] == delnode) - server.cluster->importing_slots_from[j] = NULL; - if (server.cluster->migrating_slots_to[j] == delnode) - server.cluster->migrating_slots_to[j] = NULL; - if (server.cluster->slots[j] == delnode) - clusterDelSlot(j); - } - - /* 2) Remove failure reports. 
*/ - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - - if (node == delnode) continue; - clusterNodeDelFailureReport(node,delnode); - } - dictReleaseIterator(di); - - /* 3) Remove the node from the owning shard */ - clusterRemoveNodeFromShard(delnode); - - /* 4) Free the node, unlinking it from the cluster. */ - freeClusterNode(delnode); -} - -/* Cluster node sanity check. Returns C_OK if the node id - * is valid an C_ERR otherwise. */ -int verifyClusterNodeId(const char *name, int length) { - if (length != CLUSTER_NAMELEN) return C_ERR; - for (int i = 0; i < length; i++) { - if (name[i] >= 'a' && name[i] <= 'z') continue; - if (name[i] >= '0' && name[i] <= '9') continue; - return C_ERR; - } - return C_OK; -} - -/* Node lookup by name */ -clusterNode *clusterLookupNode(const char *name, int length) { - if (verifyClusterNodeId(name, length) != C_OK) return NULL; - sds s = sdsnewlen(name, length); - dictEntry *de = dictFind(server.cluster->nodes, s); - sdsfree(s); - if (de == NULL) return NULL; - return dictGetVal(de); -} - -/* Get all the nodes in my shard. - * Note that the list returned is not computed on the fly - * via slaveof; rather, it is maintained permanently to - * track the shard membership and its life cycle is tied - * to this Redis process. Therefore, the caller must not - * release the list. */ -list *clusterGetNodesInMyShard(clusterNode *node) { - sds s = sdsnewlen(node->shard_id, CLUSTER_NAMELEN); - dictEntry *de = dictFind(server.cluster->shards,s); - sdsfree(s); - return (de != NULL) ? dictGetVal(de) : NULL; -} - -/* This is only used after the handshake. When we connect a given IP/PORT - * as a result of CLUSTER MEET we don't have the node name yet, so we - * pick a random one, and will fix it when we receive the PONG request using - * this function. 
*/ -void clusterRenameNode(clusterNode *node, char *newname) { - int retval; - sds s = sdsnewlen(node->name, CLUSTER_NAMELEN); - - serverLog(LL_DEBUG,"Renaming node %.40s into %.40s", - node->name, newname); - retval = dictDelete(server.cluster->nodes, s); - sdsfree(s); - serverAssert(retval == DICT_OK); - memcpy(node->name, newname, CLUSTER_NAMELEN); - clusterAddNode(node); -} - -void clusterAddNodeToShard(const char *shard_id, clusterNode *node) { - sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN); - dictEntry *de = dictFind(server.cluster->shards,s); - if (de == NULL) { - list *l = listCreate(); - listAddNodeTail(l, node); - serverAssert(dictAdd(server.cluster->shards, s, l) == DICT_OK); - } else { - list *l = dictGetVal(de); - if (listSearchKey(l, node) == NULL) { - listAddNodeTail(l, node); - } - sdsfree(s); - } -} - -void clusterRemoveNodeFromShard(clusterNode *node) { - sds s = sdsnewlen(node->shard_id, CLUSTER_NAMELEN); - dictEntry *de = dictFind(server.cluster->shards, s); - if (de != NULL) { - list *l = dictGetVal(de); - listNode *ln = listSearchKey(l, node); - if (ln != NULL) { - listDelNode(l, ln); - } - if (listLength(l) == 0) { - dictDelete(server.cluster->shards, s); - } - } - sdsfree(s); -} - -/* ----------------------------------------------------------------------------- - * CLUSTER config epoch handling - * -------------------------------------------------------------------------- */ - -/* Return the greatest configEpoch found in the cluster, or the current - * epoch if greater than any node configEpoch. 
*/ -uint64_t clusterGetMaxEpoch(void) { - uint64_t max = 0; - dictIterator *di; - dictEntry *de; - - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - if (node->configEpoch > max) max = node->configEpoch; - } - dictReleaseIterator(di); - if (max < server.cluster->currentEpoch) max = server.cluster->currentEpoch; - return max; -} - -/* If this node epoch is zero or is not already the greatest across the - * cluster (from the POV of the local configuration), this function will: - * - * 1) Generate a new config epoch, incrementing the current epoch. - * 2) Assign the new epoch to this node, WITHOUT any consensus. - * 3) Persist the configuration on disk before sending packets with the - * new configuration. - * - * If the new config epoch is generated and assigned, C_OK is returned, - * otherwise C_ERR is returned (since the node has already the greatest - * configuration around) and no operation is performed. - * - * Important note: this function violates the principle that config epochs - * should be generated with consensus and should be unique across the cluster. - * However Redis Cluster uses this auto-generated new config epochs in two - * cases: - * - * 1) When slots are closed after importing. Otherwise resharding would be - * too expensive. - * 2) When CLUSTER FAILOVER is called with options that force a slave to - * failover its master even if there is not master majority able to - * create a new configuration epoch. - * - * Redis Cluster will not explode using this function, even in the case of - * a collision between this node and another node, generating the same - * configuration epoch unilaterally, because the config epoch conflict - * resolution algorithm will eventually move colliding nodes to different - * config epochs. However using this function may violate the "last failover - * wins" rule, so should only be used with care. 
*/ -int clusterBumpConfigEpochWithoutConsensus(void) { - uint64_t maxEpoch = clusterGetMaxEpoch(); - - if (myself->configEpoch == 0 || - myself->configEpoch != maxEpoch) - { - server.cluster->currentEpoch++; - myself->configEpoch = server.cluster->currentEpoch; - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_FSYNC_CONFIG); - serverLog(LL_NOTICE, - "New configEpoch set to %llu", - (unsigned long long) myself->configEpoch); - return C_OK; - } else { - return C_ERR; - } -} - -/* This function is called when this node is a master, and we receive from - * another master a configuration epoch that is equal to our configuration - * epoch. - * - * BACKGROUND - * - * It is not possible that different slaves get the same config - * epoch during a failover election, because the slaves need to get voted - * by a majority. However when we perform a manual resharding of the cluster - * the node will assign a configuration epoch to itself without to ask - * for agreement. Usually resharding happens when the cluster is working well - * and is supervised by the sysadmin, however it is possible for a failover - * to happen exactly while the node we are resharding a slot to assigns itself - * a new configuration epoch, but before it is able to propagate it. - * - * So technically it is possible in this condition that two nodes end with - * the same configuration epoch. - * - * Another possibility is that there are bugs in the implementation causing - * this to happen. - * - * Moreover when a new cluster is created, all the nodes start with the same - * configEpoch. This collision resolution code allows nodes to automatically - * end with a different configEpoch at startup automatically. - * - * In all the cases, we want a mechanism that resolves this issue automatically - * as a safeguard. 
The same configuration epoch for masters serving different - * set of slots is not harmful, but it is if the nodes end serving the same - * slots for some reason (manual errors or software bugs) without a proper - * failover procedure. - * - * In general we want a system that eventually always ends with different - * masters having different configuration epochs whatever happened, since - * nothing is worse than a split-brain condition in a distributed system. - * - * BEHAVIOR - * - * When this function gets called, what happens is that if this node - * has the lexicographically smaller Node ID compared to the other node - * with the conflicting epoch (the 'sender' node), it will assign itself - * the greatest configuration epoch currently detected among nodes plus 1. - * - * This means that even if there are multiple nodes colliding, the node - * with the greatest Node ID never moves forward, so eventually all the nodes - * end with a different configuration epoch. - */ -void clusterHandleConfigEpochCollision(clusterNode *sender) { - /* Prerequisites: nodes have the same configEpoch and are both masters. */ - if (sender->configEpoch != myself->configEpoch || - !nodeIsMaster(sender) || !nodeIsMaster(myself)) return; - /* Don't act if the colliding node has a smaller Node ID. */ - if (memcmp(sender->name,myself->name,CLUSTER_NAMELEN) <= 0) return; - /* Get the next ID available at the best of this node knowledge. */ - server.cluster->currentEpoch++; - myself->configEpoch = server.cluster->currentEpoch; - clusterSaveConfigOrDie(1); - serverLog(LL_VERBOSE, - "WARNING: configEpoch collision with node %.40s (%s)." 
- " configEpoch set to %llu", - sender->name,sender->human_nodename, - (unsigned long long) myself->configEpoch); -} - -/* ----------------------------------------------------------------------------- - * CLUSTER nodes blacklist - * - * The nodes blacklist is just a way to ensure that a given node with a given - * Node ID is not re-added before some time elapsed (this time is specified - * in seconds in CLUSTER_BLACKLIST_TTL). - * - * This is useful when we want to remove a node from the cluster completely: - * when CLUSTER FORGET is called, it also puts the node into the blacklist so - * that even if we receive gossip messages from other nodes that still remember - * about the node we want to remove, we don't re-add it before some time. - * - * Currently the CLUSTER_BLACKLIST_TTL is set to 1 minute, this means - * that redis-cli has 60 seconds to send CLUSTER FORGET messages to nodes - * in the cluster without dealing with the problem of other nodes re-adding - * back the node to nodes we already sent the FORGET command to. - * - * The data structure used is a hash table with an sds string representing - * the node ID as key, and the time when it is ok to re-add the node as - * value. - * -------------------------------------------------------------------------- */ - -#define CLUSTER_BLACKLIST_TTL 60 /* 1 minute. */ - - -/* Before of the addNode() or Exists() operations we always remove expired - * entries from the black list. This is an O(N) operation but it is not a - * problem since add / exists operations are called very infrequently and - * the hash table is supposed to contain very little elements at max. - * However without the cleanup during long uptime and with some automated - * node add/removal procedures, entries could accumulate. 
*/ -void clusterBlacklistCleanup(void) { - dictIterator *di; - dictEntry *de; - - di = dictGetSafeIterator(server.cluster->nodes_black_list); - while((de = dictNext(di)) != NULL) { - int64_t expire = dictGetUnsignedIntegerVal(de); - - if (expire < server.unixtime) - dictDelete(server.cluster->nodes_black_list,dictGetKey(de)); - } - dictReleaseIterator(di); -} - -/* Cleanup the blacklist and add a new node ID to the black list. */ -void clusterBlacklistAddNode(clusterNode *node) { - dictEntry *de; - sds id = sdsnewlen(node->name,CLUSTER_NAMELEN); - - clusterBlacklistCleanup(); - if (dictAdd(server.cluster->nodes_black_list,id,NULL) == DICT_OK) { - /* If the key was added, duplicate the sds string representation of - * the key for the next lookup. We'll free it at the end. */ - id = sdsdup(id); - } - de = dictFind(server.cluster->nodes_black_list,id); - dictSetUnsignedIntegerVal(de,time(NULL)+CLUSTER_BLACKLIST_TTL); - sdsfree(id); -} - -/* Return non-zero if the specified node ID exists in the blacklist. - * You don't need to pass an sds string here, any pointer to 40 bytes - * will work. */ -int clusterBlacklistExists(char *nodeid) { - sds id = sdsnewlen(nodeid,CLUSTER_NAMELEN); - int retval; - - clusterBlacklistCleanup(); - retval = dictFind(server.cluster->nodes_black_list,id) != NULL; - sdsfree(id); - return retval; -} - -/* ----------------------------------------------------------------------------- - * CLUSTER messages exchange - PING/PONG and gossip - * -------------------------------------------------------------------------- */ - -/* This function checks if a given node should be marked as FAIL. - * It happens if the following conditions are met: - * - * 1) We received enough failure reports from other master nodes via gossip. - * Enough means that the majority of the masters signaled the node is - * down recently. - * 2) We believe this node is in PFAIL state. 
- * - * If a failure is detected we also inform the whole cluster about this - * event trying to force every other node to set the FAIL flag for the node. - * - * Note that the form of agreement used here is weak, as we collect the majority - * of masters state during some time, and even if we force agreement by - * propagating the FAIL message, because of partitions we may not reach every - * node. However: - * - * 1) Either we reach the majority and eventually the FAIL state will propagate - * to all the cluster. - * 2) Or there is no majority so no slave promotion will be authorized and the - * FAIL flag will be cleared after some time. - */ -void markNodeAsFailingIfNeeded(clusterNode *node) { - int failures; - int needed_quorum = (server.cluster->size / 2) + 1; - - if (!nodeTimedOut(node)) return; /* We can reach it. */ - if (nodeFailed(node)) return; /* Already FAILing. */ - - failures = clusterNodeFailureReportsCount(node); - /* Also count myself as a voter if I'm a master. */ - if (nodeIsMaster(myself)) failures++; - if (failures < needed_quorum) return; /* No weak agreement from masters. */ - - serverLog(LL_NOTICE, - "Marking node %.40s (%s) as failing (quorum reached).", node->name, node->human_nodename); - - /* Mark the node as failing. */ - node->flags &= ~CLUSTER_NODE_PFAIL; - node->flags |= CLUSTER_NODE_FAIL; - node->fail_time = mstime(); - - /* Broadcast the failing node name to everybody, forcing all the other - * reachable nodes to flag the node as FAIL. - * We do that even if this node is a replica and not a master: anyway - * the failing state is triggered collecting failure reports from masters, - * so here the replica is only helping propagating this status. */ - clusterSendFail(node->name); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); -} - -/* This function is called only if a node is marked as FAIL, but we are able - * to reach it again. It checks if there are the conditions to undo the FAIL - * state. 
*/ -void clearNodeFailureIfNeeded(clusterNode *node) { - mstime_t now = mstime(); - - serverAssert(nodeFailed(node)); - - /* For slaves we always clear the FAIL flag if we can contact the - * node again. */ - if (nodeIsSlave(node) || node->numslots == 0) { - serverLog(LL_NOTICE, - "Clear FAIL state for node %.40s (%s):%s is reachable again.", - node->name,node->human_nodename, - nodeIsSlave(node) ? "replica" : "master without slots"); - node->flags &= ~CLUSTER_NODE_FAIL; - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); - } - - /* If it is a master and... - * 1) The FAIL state is old enough. - * 2) It is yet serving slots from our point of view (not failed over). - * Apparently no one is going to fix these slots, clear the FAIL flag. */ - if (nodeIsMaster(node) && node->numslots > 0 && - (now - node->fail_time) > - (server.cluster_node_timeout * CLUSTER_FAIL_UNDO_TIME_MULT)) - { - serverLog(LL_NOTICE, - "Clear FAIL state for node %.40s (%s): is reachable again and nobody is serving its slots after some time.", - node->name, node->human_nodename); - node->flags &= ~CLUSTER_NODE_FAIL; - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); - } -} - -/* Return true if we already have a node in HANDSHAKE state matching the - * specified ip address and port number. This function is used in order to - * avoid adding a new handshake node for the same address multiple times. */ -int clusterHandshakeInProgress(char *ip, int port, int cport) { - dictIterator *di; - dictEntry *de; - - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - - if (!nodeInHandshake(node)) continue; - if (!strcasecmp(node->ip,ip) && - getNodeDefaultClientPort(node) == port && - node->cport == cport) break; - } - dictReleaseIterator(di); - return de != NULL; -} - -/* Start a handshake with the specified address if there is not one - * already in progress. 
Returns non-zero if the handshake was actually - * started. On error zero is returned and errno is set to one of the - * following values: - * - * EAGAIN - There is already a handshake in progress for this address. - * EINVAL - IP or port are not valid. */ -int clusterStartHandshake(char *ip, int port, int cport) { - clusterNode *n; - char norm_ip[NET_IP_STR_LEN]; - struct sockaddr_storage sa; - - /* IP sanity check */ - if (inet_pton(AF_INET,ip, - &(((struct sockaddr_in *)&sa)->sin_addr))) - { - sa.ss_family = AF_INET; - } else if (inet_pton(AF_INET6,ip, - &(((struct sockaddr_in6 *)&sa)->sin6_addr))) - { - sa.ss_family = AF_INET6; - } else { - errno = EINVAL; - return 0; - } - - /* Port sanity check */ - if (port <= 0 || port > 65535 || cport <= 0 || cport > 65535) { - errno = EINVAL; - return 0; - } - - /* Set norm_ip as the normalized string representation of the node - * IP address. */ - memset(norm_ip,0,NET_IP_STR_LEN); - if (sa.ss_family == AF_INET) - inet_ntop(AF_INET, - (void*)&(((struct sockaddr_in *)&sa)->sin_addr), - norm_ip,NET_IP_STR_LEN); - else - inet_ntop(AF_INET6, - (void*)&(((struct sockaddr_in6 *)&sa)->sin6_addr), - norm_ip,NET_IP_STR_LEN); - - if (clusterHandshakeInProgress(norm_ip,port,cport)) { - errno = EAGAIN; - return 0; - } - - /* Add the node with a random address (NULL as first argument to - * createClusterNode()). Everything will be fixed during the - * handshake. 
*/ - n = createClusterNode(NULL,CLUSTER_NODE_HANDSHAKE|CLUSTER_NODE_MEET); - memcpy(n->ip,norm_ip,sizeof(n->ip)); +ConnectionType *connTypeOfCluster(void) { if (server.tls_cluster) { - n->tls_port = port; - } else { - n->tcp_port = port; + return connectionTypeTls(); } - n->cport = cport; - clusterAddNode(n); - return 1; -} -static void getClientPortFromClusterMsg(clusterMsg *hdr, int *tls_port, int *tcp_port) { - if (server.tls_cluster) { - *tls_port = ntohs(hdr->port); - *tcp_port = ntohs(hdr->pport); - } else { - *tls_port = ntohs(hdr->pport); - *tcp_port = ntohs(hdr->port); - } -} - -static void getClientPortFromGossip(clusterMsgDataGossip *g, int *tls_port, int *tcp_port) { - if (server.tls_cluster) { - *tls_port = ntohs(g->port); - *tcp_port = ntohs(g->pport); - } else { - *tls_port = ntohs(g->pport); - *tcp_port = ntohs(g->port); - } -} - -/* Process the gossip section of PING or PONG packets. - * Note that this function assumes that the packet is already sanity-checked - * by the caller, not in the content of the gossip section, but in the - * length. */ -void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { - uint16_t count = ntohs(hdr->count); - clusterMsgDataGossip *g = (clusterMsgDataGossip*) hdr->data.ping.gossip; - clusterNode *sender = link->node ? link->node : clusterLookupNode(hdr->sender, CLUSTER_NAMELEN); - - while(count--) { - uint16_t flags = ntohs(g->flags); - clusterNode *node; - sds ci; - - if (server.verbosity == LL_DEBUG) { - ci = representClusterNodeFlags(sdsempty(), flags); - serverLog(LL_DEBUG,"GOSSIP %.40s %s:%d@%d %s", - g->nodename, - g->ip, - ntohs(g->port), - ntohs(g->cport), - ci); - sdsfree(ci); - } - - /* Convert port and pport into TCP port and TLS port. 
*/ - int msg_tls_port, msg_tcp_port; - getClientPortFromGossip(g, &msg_tls_port, &msg_tcp_port); - - /* Update our state accordingly to the gossip sections */ - node = clusterLookupNode(g->nodename, CLUSTER_NAMELEN); - if (node) { - /* We already know this node. - Handle failure reports, only when the sender is a master. */ - if (sender && nodeIsMaster(sender) && node != myself) { - if (flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) { - if (clusterNodeAddFailureReport(node,sender)) { - serverLog(LL_VERBOSE, - "Node %.40s (%s) reported node %.40s (%s) as not reachable.", - sender->name, sender->human_nodename, node->name, node->human_nodename); - } - markNodeAsFailingIfNeeded(node); - } else { - if (clusterNodeDelFailureReport(node,sender)) { - serverLog(LL_VERBOSE, - "Node %.40s (%s) reported node %.40s (%s) is back online.", - sender->name, sender->human_nodename, node->name, node->human_nodename); - } - } - } - - /* If from our POV the node is up (no failure flags are set), - * we have no pending ping for the node, nor we have failure - * reports for this node, update the last pong time with the - * one we see from the other nodes. */ - if (!(flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) && - node->ping_sent == 0 && - clusterNodeFailureReportsCount(node) == 0) - { - mstime_t pongtime = ntohl(g->pong_received); - pongtime *= 1000; /* Convert back to milliseconds. */ - - /* Replace the pong time with the received one only if - * it's greater than our view but is not in the future - * (with 500 milliseconds tolerance) from the POV of our - * clock. */ - if (pongtime <= (server.mstime+500) && - pongtime > node->pong_received) - { - node->pong_received = pongtime; - } - } - - /* If we already know this node, but it is not reachable, and - * we see a different address in the gossip section of a node that - * can talk with this other node, update the address, disconnect - * the old link if any, so that we'll attempt to connect with the - * new address. 
*/ - if (node->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL) && - !(flags & CLUSTER_NODE_NOADDR) && - !(flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) && - (strcasecmp(node->ip,g->ip) || - node->tls_port != (server.tls_cluster ? ntohs(g->port) : ntohs(g->pport)) || - node->tcp_port != (server.tls_cluster ? ntohs(g->pport) : ntohs(g->port)) || - node->cport != ntohs(g->cport))) - { - if (node->link) freeClusterLink(node->link); - memcpy(node->ip,g->ip,NET_IP_STR_LEN); - node->tcp_port = msg_tcp_port; - node->tls_port = msg_tls_port; - node->cport = ntohs(g->cport); - node->flags &= ~CLUSTER_NODE_NOADDR; - } - } else { - /* If it's not in NOADDR state and we don't have it, we - * add it to our trusted dict with exact nodeid and flag. - * Note that we cannot simply start a handshake against - * this IP/PORT pairs, since IP/PORT can be reused already, - * otherwise we risk joining another cluster. - * - * Note that we require that the sender of this gossip message - * is a well known node in our cluster, otherwise we risk - * joining another cluster. */ - if (sender && - !(flags & CLUSTER_NODE_NOADDR) && - !clusterBlacklistExists(g->nodename)) - { - clusterNode *node; - node = createClusterNode(g->nodename, flags); - memcpy(node->ip,g->ip,NET_IP_STR_LEN); - node->tcp_port = msg_tcp_port; - node->tls_port = msg_tls_port; - node->cport = ntohs(g->cport); - clusterAddNode(node); - } - } - - /* Next node */ - g++; - } -} - -/* IP -> string conversion. 'buf' is supposed to at least be 46 bytes. - * If 'announced_ip' length is non-zero, it is used instead of extracting - * the IP from the socket peer address. */ -int nodeIp2String(char *buf, clusterLink *link, char *announced_ip) { - if (announced_ip[0] != '\0') { - memcpy(buf,announced_ip,NET_IP_STR_LEN); - buf[NET_IP_STR_LEN-1] = '\0'; /* We are not sure the input is sane. 
*/ - return C_OK; - } else { - if (connAddrPeerName(link->conn, buf, NET_IP_STR_LEN, NULL) == -1) { - serverLog(LL_NOTICE, "Error converting peer IP to string: %s", - link->conn ? connGetLastError(link->conn) : "no link"); - return C_ERR; - } - return C_OK; - } -} - -/* Update the node address to the IP address that can be extracted - * from link->fd, or if hdr->myip is non empty, to the address the node - * is announcing us. The port is taken from the packet header as well. - * - * If the address or port changed, disconnect the node link so that we'll - * connect again to the new address. - * - * If the ip/port pair are already correct no operation is performed at - * all. - * - * The function returns 0 if the node address is still the same, - * otherwise 1 is returned. */ -int nodeUpdateAddressIfNeeded(clusterNode *node, clusterLink *link, - clusterMsg *hdr) -{ - char ip[NET_IP_STR_LEN] = {0}; - int cport = ntohs(hdr->cport); - int tcp_port, tls_port; - getClientPortFromClusterMsg(hdr, &tls_port, &tcp_port); - - /* We don't proceed if the link is the same as the sender link, as this - * function is designed to see if the node link is consistent with the - * symmetric link that is used to receive PINGs from the node. - * - * As a side effect this function never frees the passed 'link', so - * it is safe to call during packet processing. */ - if (link == node->link) return 0; - - /* If the peer IP is unavailable for some reasons like invalid fd or closed - * link, just give up the update this time, and the update will be retried - * in the next round of PINGs */ - if (nodeIp2String(ip,link,hdr->myip) == C_ERR) return 0; - - if (node->tcp_port == tcp_port && node->cport == cport && node->tls_port == tls_port && - strcmp(ip,node->ip) == 0) return 0; - - /* IP / port is different, update it. 
*/ - memcpy(node->ip,ip,sizeof(ip)); - node->tcp_port = tcp_port; - node->tls_port = tls_port; - node->cport = cport; - if (node->link) freeClusterLink(node->link); - node->flags &= ~CLUSTER_NODE_NOADDR; - serverLog(LL_NOTICE,"Address updated for node %.40s (%s), now %s:%d", - node->name, node->human_nodename, node->ip, getNodeDefaultClientPort(node)); - - /* Check if this is our master and we have to change the - * replication target as well. */ - if (nodeIsSlave(myself) && myself->slaveof == node) - replicationSetMaster(node->ip, getNodeDefaultReplicationPort(node)); - return 1; -} - -/* Reconfigure the specified node 'n' as a master. This function is called when - * a node that we believed to be a slave is now acting as master in order to - * update the state of the node. */ -void clusterSetNodeAsMaster(clusterNode *n) { - if (nodeIsMaster(n)) return; - - if (n->slaveof) { - clusterNodeRemoveSlave(n->slaveof,n); - if (n != myself) n->flags |= CLUSTER_NODE_MIGRATE_TO; - } - n->flags &= ~CLUSTER_NODE_SLAVE; - n->flags |= CLUSTER_NODE_MASTER; - n->slaveof = NULL; - - /* Update config and state. */ - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE); -} - -/* This function is called when we receive a master configuration via a - * PING, PONG or UPDATE packet. What we receive is a node, a configEpoch of the - * node, and the set of slots claimed under this configEpoch. - * - * What we do is to rebind the slots with newer configuration compared to our - * local configuration, and if needed, we turn ourself into a replica of the - * node (see the function comments for more info). - * - * The 'sender' is the node for which we received a configuration update. - * Sometimes it is not actually the "Sender" of the information, like in the - * case we receive the info via an UPDATE packet. 
*/ -void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoch, unsigned char *slots) { - int j; - clusterNode *curmaster = NULL, *newmaster = NULL; - /* The dirty slots list is a list of slots for which we lose the ownership - * while having still keys inside. This usually happens after a failover - * or after a manual cluster reconfiguration operated by the admin. - * - * If the update message is not able to demote a master to slave (in this - * case we'll resync with the master updating the whole key space), we - * need to delete all the keys in the slots we lost ownership. */ - uint16_t dirty_slots[CLUSTER_SLOTS]; - int dirty_slots_count = 0; - - /* We should detect if sender is new master of our shard. - * We will know it if all our slots were migrated to sender, and sender - * has no slots except ours */ - int sender_slots = 0; - int migrated_our_slots = 0; - - /* Here we set curmaster to this node or the node this node - * replicates to if it's a slave. In the for loop we are - * interested to check if slots are taken away from curmaster. */ - curmaster = nodeIsMaster(myself) ? myself : myself->slaveof; - - if (sender == myself) { - serverLog(LL_NOTICE,"Discarding UPDATE message about myself."); - return; - } - - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (bitmapTestBit(slots,j)) { - sender_slots++; - - /* The slot is already bound to the sender of this message. */ - if (server.cluster->slots[j] == sender) { - bitmapClearBit(server.cluster->owner_not_claiming_slot, j); - continue; - } - - /* The slot is in importing state, it should be modified only - * manually via redis-cli (example: a resharding is in progress - * and the migrating side slot was already closed and is advertising - * a new config. We still want the slot to be closed manually). 
*/ - if (server.cluster->importing_slots_from[j]) continue; - - /* We rebind the slot to the new node claiming it if: - * 1) The slot was unassigned or the previous owner no longer owns the slot or - * the new node claims it with a greater configEpoch. - * 2) We are not currently importing the slot. */ - if (isSlotUnclaimed(j) || - server.cluster->slots[j]->configEpoch < senderConfigEpoch) - { - /* Was this slot mine, and still contains keys? Mark it as - * a dirty slot. */ - if (server.cluster->slots[j] == myself && - countKeysInSlot(j) && - sender != myself) - { - dirty_slots[dirty_slots_count] = j; - dirty_slots_count++; - } - - if (server.cluster->slots[j] == curmaster) { - newmaster = sender; - migrated_our_slots++; - } - clusterDelSlot(j); - clusterAddSlot(sender,j); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_FSYNC_CONFIG); - } - } else if (server.cluster->slots[j] == sender) { - /* The slot is currently bound to the sender but the sender is no longer - * claiming it. We don't want to unbind the slot yet as it can cause the cluster - * to move to FAIL state and also throw client error. Keeping the slot bound to - * the previous owner will cause a few client side redirects, but won't throw - * any errors. We will keep track of the uncertainty in ownership to avoid - * propagating misinformation about this slot's ownership using UPDATE - * messages. */ - bitmapSetBit(server.cluster->owner_not_claiming_slot, j); - } - } - - /* After updating the slots configuration, don't do any actual change - * in the state of the server if a module disabled Redis Cluster - * keys redirections. */ - if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION) - return; - - /* If at least one slot was reassigned from a node to another node - * with a greater configEpoch, it is possible that: - * 1) We are a master left without slots. 
This means that we were - * failed over and we should turn into a replica of the new - * master. - * 2) We are a slave and our master is left without slots. We need - * to replicate to the new slots owner. */ - if (newmaster && curmaster->numslots == 0 && - (server.cluster_allow_replica_migration || - sender_slots == migrated_our_slots)) { - serverLog(LL_NOTICE, - "Configuration change detected. Reconfiguring myself " - "as a replica of %.40s (%s)", sender->name, sender->human_nodename); - clusterSetMaster(sender); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_FSYNC_CONFIG); - } else if (myself->slaveof && myself->slaveof->slaveof && - /* In some rare case when CLUSTER FAILOVER TAKEOVER is used, it - * can happen that myself is a replica of a replica of myself. If - * this happens, we do nothing to avoid a crash and wait for the - * admin to repair the cluster. */ - myself->slaveof->slaveof != myself) - { - /* Safeguard against sub-replicas. A replica's master can turn itself - * into a replica if its last slot is removed. If no other node takes - * over the slot, there is nothing else to trigger replica migration. */ - serverLog(LL_NOTICE, - "I'm a sub-replica! Reconfiguring myself as a replica of grandmaster %.40s (%s)", - myself->slaveof->slaveof->name, myself->slaveof->slaveof->human_nodename); - clusterSetMaster(myself->slaveof->slaveof); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_FSYNC_CONFIG); - } else if (dirty_slots_count) { - /* If we are here, we received an update message which removed - * ownership for certain slots we still have keys about, but still - * we are serving some slots, so this master node was not demoted to - * a slave. - * - * In order to maintain a consistent state between keys and slots - * we need to remove all the keys from the slots we lost. 
*/ - for (j = 0; j < dirty_slots_count; j++) - delKeysInSlot(dirty_slots[j]); - } -} - -/* Cluster ping extensions. - * - * The ping/pong/meet messages support arbitrary extensions to add additional - * metadata to the messages that are sent between the various nodes in the - * cluster. The extensions take the form: - * [ Header length + type (8 bytes) ] - * [ Extension information (Arbitrary length, but must be 8 byte padded) ] - */ - - -/* Returns the length of a given extension */ -static uint32_t getPingExtLength(clusterMsgPingExt *ext) { - return ntohl(ext->length); -} - -/* Returns the initial position of ping extensions. May return an invalid - * address if there are no ping extensions. */ -static clusterMsgPingExt *getInitialPingExt(clusterMsg *hdr, int count) { - clusterMsgPingExt *initial = (clusterMsgPingExt*) &(hdr->data.ping.gossip[count]); - return initial; -} - -/* Given a current ping extension, returns the start of the next extension. May return - * an invalid address if there are no further ping extensions. 
*/ -static clusterMsgPingExt *getNextPingExt(clusterMsgPingExt *ext) { - clusterMsgPingExt *next = (clusterMsgPingExt *) (((char *) ext) + getPingExtLength(ext)); - return next; -} - -/* All PING extensions must be 8-byte aligned */ -uint32_t getAlignedPingExtSize(uint32_t dataSize) { - - return sizeof(clusterMsgPingExt) + EIGHT_BYTE_ALIGN(dataSize); -} - -uint32_t getHostnamePingExtSize(void) { - if (sdslen(myself->hostname) == 0) { - return 0; - } - return getAlignedPingExtSize(sdslen(myself->hostname) + 1); -} - -uint32_t getHumanNodenamePingExtSize(void) { - if (sdslen(myself->human_nodename) == 0) { - return 0; - } - return getAlignedPingExtSize(sdslen(myself->human_nodename) + 1); -} - -uint32_t getShardIdPingExtSize(void) { - return getAlignedPingExtSize(sizeof(clusterMsgPingExtShardId)); -} - -uint32_t getForgottenNodeExtSize(void) { - return getAlignedPingExtSize(sizeof(clusterMsgPingExtForgottenNode)); -} - -void *preparePingExt(clusterMsgPingExt *ext, uint16_t type, uint32_t length) { - ext->type = htons(type); - ext->length = htonl(length); - return &ext->ext[0]; -} - -clusterMsgPingExt *nextPingExt(clusterMsgPingExt *ext) { - return (clusterMsgPingExt *)((char*)ext + ntohl(ext->length)); -} - -/* 1. If a NULL hdr is provided, compute the extension size; - * 2. If a non-NULL hdr is provided, write the hostname ping - * extension at the start of the cursor. This function - * will update the cursor to point to the end of the - * written extension and will return the amount of bytes - * written. 
*/ -uint32_t writePingExt(clusterMsg *hdr, int gossipcount) { - uint16_t extensions = 0; - uint32_t totlen = 0; - clusterMsgPingExt *cursor = NULL; - /* Set the initial extension position */ - if (hdr != NULL) { - cursor = getInitialPingExt(hdr, gossipcount); - } - - /* hostname is optional */ - if (sdslen(myself->hostname) != 0) { - if (cursor != NULL) { - /* Populate hostname */ - clusterMsgPingExtHostname *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_HOSTNAME, getHostnamePingExtSize()); - memcpy(ext->hostname, myself->hostname, sdslen(myself->hostname)); - - /* Move the write cursor */ - cursor = nextPingExt(cursor); - } - - totlen += getHostnamePingExtSize(); - extensions++; - } - - if (sdslen(myself->human_nodename) != 0) { - if (cursor != NULL) { - /* Populate human_nodename */ - clusterMsgPingExtHumanNodename *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_HUMAN_NODENAME, getHumanNodenamePingExtSize()); - memcpy(ext->human_nodename, myself->human_nodename, sdslen(myself->human_nodename)); - - /* Move the write cursor */ - cursor = nextPingExt(cursor); - } - - totlen += getHumanNodenamePingExtSize(); - extensions++; - } - - /* Gossip forgotten nodes */ - if (dictSize(server.cluster->nodes_black_list) > 0) { - dictIterator *di = dictGetIterator(server.cluster->nodes_black_list); - dictEntry *de; - while ((de = dictNext(di)) != NULL) { - if (cursor != NULL) { - uint64_t expire = dictGetUnsignedIntegerVal(de); - if ((time_t)expire < server.unixtime) continue; /* already expired */ - uint64_t ttl = expire - server.unixtime; - clusterMsgPingExtForgottenNode *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_FORGOTTEN_NODE, getForgottenNodeExtSize()); - memcpy(ext->name, dictGetKey(de), CLUSTER_NAMELEN); - ext->ttl = htonu64(ttl); - - /* Move the write cursor */ - cursor = nextPingExt(cursor); - } - totlen += getForgottenNodeExtSize(); - extensions++; - } - dictReleaseIterator(di); - } - - /* Populate shard_id */ - if (cursor != NULL) { - 
clusterMsgPingExtShardId *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_SHARDID, getShardIdPingExtSize()); - memcpy(ext->shard_id, myself->shard_id, CLUSTER_NAMELEN); - - /* Move the write cursor */ - cursor = nextPingExt(cursor); - } - totlen += getShardIdPingExtSize(); - extensions++; - - if (hdr != NULL) { - if (extensions != 0) { - hdr->mflags[0] |= CLUSTERMSG_FLAG0_EXT_DATA; - } - hdr->extensions = htons(extensions); - } - - return totlen; -} - -/* We previously validated the extensions, so this function just needs to - * handle the extensions. */ -void clusterProcessPingExtensions(clusterMsg *hdr, clusterLink *link) { - clusterNode *sender = link->node ? link->node : clusterLookupNode(hdr->sender, CLUSTER_NAMELEN); - char *ext_hostname = NULL; - char *ext_humannodename = NULL; - char *ext_shardid = NULL; - uint16_t extensions = ntohs(hdr->extensions); - /* Loop through all the extensions and process them */ - clusterMsgPingExt *ext = getInitialPingExt(hdr, ntohs(hdr->count)); - while (extensions--) { - uint16_t type = ntohs(ext->type); - if (type == CLUSTERMSG_EXT_TYPE_HOSTNAME) { - clusterMsgPingExtHostname *hostname_ext = (clusterMsgPingExtHostname *) &(ext->ext[0].hostname); - ext_hostname = hostname_ext->hostname; - } else if (type == CLUSTERMSG_EXT_TYPE_HUMAN_NODENAME) { - clusterMsgPingExtHumanNodename *humannodename_ext = (clusterMsgPingExtHumanNodename *) &(ext->ext[0].human_nodename); - ext_humannodename = humannodename_ext->human_nodename; - } else if (type == CLUSTERMSG_EXT_TYPE_FORGOTTEN_NODE) { - clusterMsgPingExtForgottenNode *forgotten_node_ext = &(ext->ext[0].forgotten_node); - clusterNode *n = clusterLookupNode(forgotten_node_ext->name, CLUSTER_NAMELEN); - if (n && n != myself && !(nodeIsSlave(myself) && myself->slaveof == n)) { - sds id = sdsnewlen(forgotten_node_ext->name, CLUSTER_NAMELEN); - dictEntry *de = dictAddOrFind(server.cluster->nodes_black_list, id); - uint64_t expire = server.unixtime + ntohu64(forgotten_node_ext->ttl); - 
dictSetUnsignedIntegerVal(de, expire); - clusterDelNode(n); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_SAVE_CONFIG); - } - } else if (type == CLUSTERMSG_EXT_TYPE_SHARDID) { - clusterMsgPingExtShardId *shardid_ext = (clusterMsgPingExtShardId *) &(ext->ext[0].shard_id); - ext_shardid = shardid_ext->shard_id; - } else { - /* Unknown type, we will ignore it but log what happened. */ - serverLog(LL_WARNING, "Received unknown extension type %d", type); - } - - /* We know this will be valid since we validated it ahead of time */ - ext = getNextPingExt(ext); - } - /* If the node did not send us a hostname extension, assume - * they don't have an announced hostname. Otherwise, we'll - * set it now. */ - updateAnnouncedHostname(sender, ext_hostname); - updateAnnouncedHumanNodename(sender, ext_humannodename); - updateShardId(sender, ext_shardid); -} - -static clusterNode *getNodeFromLinkAndMsg(clusterLink *link, clusterMsg *hdr) { - clusterNode *sender; - if (link->node && !nodeInHandshake(link->node)) { - /* If the link has an associated node, use that so that we don't have to look it - * up every time, except when the node is still in handshake, the node still has - * a random name thus not truly "known". */ - sender = link->node; - } else { - /* Otherwise, fetch sender based on the message */ - sender = clusterLookupNode(hdr->sender, CLUSTER_NAMELEN); - /* We know the sender node but haven't associate it with the link. This must - * be an inbound link because only for inbound links we didn't know which node - * to associate when they were created. */ - if (sender && !link->node) { - setClusterNodeToInboundClusterLink(sender, link); - } - } - return sender; -} - -/* When this function is called, there is a packet to process starting - * at link->rcvbuf. Releasing the buffer is up to the caller, so this - * function should just handle the higher level stuff of processing the - * packet, modifying the cluster state if needed. 
- * - * The function returns 1 if the link is still valid after the packet - * was processed, otherwise 0 if the link was freed since the packet - * processing lead to some inconsistency error (for instance a PONG - * received from the wrong sender ID). */ -int clusterProcessPacket(clusterLink *link) { - clusterMsg *hdr = (clusterMsg*) link->rcvbuf; - uint32_t totlen = ntohl(hdr->totlen); - uint16_t type = ntohs(hdr->type); - mstime_t now = mstime(); - - if (type < CLUSTERMSG_TYPE_COUNT) - server.cluster->stats_bus_messages_received[type]++; - serverLog(LL_DEBUG,"--- Processing packet of type %s, %lu bytes", - clusterGetMessageTypeString(type), (unsigned long) totlen); - - /* Perform sanity checks */ - if (totlen < 16) return 1; /* At least signature, version, totlen, count. */ - if (totlen > link->rcvbuf_len) return 1; - - if (ntohs(hdr->ver) != CLUSTER_PROTO_VER) { - /* Can't handle messages of different versions. */ - return 1; - } - - if (type == server.cluster_drop_packet_filter) { - serverLog(LL_WARNING, "Dropping packet that matches debug drop filter"); - return 1; - } - - uint16_t flags = ntohs(hdr->flags); - uint16_t extensions = ntohs(hdr->extensions); - uint64_t senderCurrentEpoch = 0, senderConfigEpoch = 0; - uint32_t explen; /* expected length of this packet */ - clusterNode *sender; - - if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG || - type == CLUSTERMSG_TYPE_MEET) - { - uint16_t count = ntohs(hdr->count); - - explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - explen += (sizeof(clusterMsgDataGossip)*count); - - /* If there is extension data, which doesn't have a fixed length, - * loop through them and validate the length of it now. 
*/ - if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) { - clusterMsgPingExt *ext = getInitialPingExt(hdr, count); - while (extensions--) { - uint16_t extlen = getPingExtLength(ext); - if (extlen % 8 != 0) { - serverLog(LL_WARNING, "Received a %s packet without proper padding (%d bytes)", - clusterGetMessageTypeString(type), (int) extlen); - return 1; - } - if ((totlen - explen) < extlen) { - serverLog(LL_WARNING, "Received invalid %s packet with extension data that exceeds " - "total packet length (%lld)", clusterGetMessageTypeString(type), - (unsigned long long) totlen); - return 1; - } - explen += extlen; - ext = getNextPingExt(ext); - } - } - } else if (type == CLUSTERMSG_TYPE_FAIL) { - explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - explen += sizeof(clusterMsgDataFail); - } else if (type == CLUSTERMSG_TYPE_PUBLISH || type == CLUSTERMSG_TYPE_PUBLISHSHARD) { - explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - explen += sizeof(clusterMsgDataPublish) - - 8 + - ntohl(hdr->data.publish.msg.channel_len) + - ntohl(hdr->data.publish.msg.message_len); - } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST || - type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK || - type == CLUSTERMSG_TYPE_MFSTART) - { - explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - } else if (type == CLUSTERMSG_TYPE_UPDATE) { - explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - explen += sizeof(clusterMsgDataUpdate); - } else if (type == CLUSTERMSG_TYPE_MODULE) { - explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - explen += sizeof(clusterMsgModule) - - 3 + ntohl(hdr->data.module.msg.len); - } else { - /* We don't know this type of packet, so we assume it's well formed. 
*/ - explen = totlen; - } - - if (totlen != explen) { - serverLog(LL_WARNING, "Received invalid %s packet of length %lld but expected length %lld", - clusterGetMessageTypeString(type), (unsigned long long) totlen, (unsigned long long) explen); - return 1; - } - - sender = getNodeFromLinkAndMsg(link, hdr); - - /* Update the last time we saw any data from this node. We - * use this in order to avoid detecting a timeout from a node that - * is just sending a lot of data in the cluster bus, for instance - * because of Pub/Sub. */ - if (sender) sender->data_received = now; - - if (sender && !nodeInHandshake(sender)) { - /* Update our currentEpoch if we see a newer epoch in the cluster. */ - senderCurrentEpoch = ntohu64(hdr->currentEpoch); - senderConfigEpoch = ntohu64(hdr->configEpoch); - if (senderCurrentEpoch > server.cluster->currentEpoch) - server.cluster->currentEpoch = senderCurrentEpoch; - /* Update the sender configEpoch if it is publishing a newer one. */ - if (senderConfigEpoch > sender->configEpoch) { - sender->configEpoch = senderConfigEpoch; - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_FSYNC_CONFIG); - } - /* Update the replication offset info for this node. */ - sender->repl_offset = ntohu64(hdr->offset); - sender->repl_offset_time = now; - /* If we are a slave performing a manual failover and our master - * sent its offset while already paused, populate the MF state. */ - if (server.cluster->mf_end && - nodeIsSlave(myself) && - myself->slaveof == sender && - hdr->mflags[0] & CLUSTERMSG_FLAG0_PAUSED && - server.cluster->mf_master_offset == -1) - { - server.cluster->mf_master_offset = sender->repl_offset; - clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER); - serverLog(LL_NOTICE, - "Received replication offset for paused " - "master manual failover: %lld", - server.cluster->mf_master_offset); - } - } - - /* Initial processing of PING and MEET requests replying with a PONG. 
*/ - if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_MEET) { - /* We use incoming MEET messages in order to set the address - * for 'myself', since only other cluster nodes will send us - * MEET messages on handshakes, when the cluster joins, or - * later if we changed address, and those nodes will use our - * official address to connect to us. So by obtaining this address - * from the socket is a simple way to discover / update our own - * address in the cluster without it being hardcoded in the config. - * - * However if we don't have an address at all, we update the address - * even with a normal PING packet. If it's wrong it will be fixed - * by MEET later. */ - if ((type == CLUSTERMSG_TYPE_MEET || myself->ip[0] == '\0') && - server.cluster_announce_ip == NULL) - { - char ip[NET_IP_STR_LEN]; - - if (connAddrSockName(link->conn,ip,sizeof(ip),NULL) != -1 && - strcmp(ip,myself->ip)) - { - memcpy(myself->ip,ip,NET_IP_STR_LEN); - serverLog(LL_NOTICE,"IP address for this node updated to %s", - myself->ip); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); - } - } - - /* Add this node if it is new for us and the msg type is MEET. - * In this stage we don't try to add the node with the right - * flags, slaveof pointer, and so forth, as this details will be - * resolved when we'll receive PONGs from the node. */ - if (!sender && type == CLUSTERMSG_TYPE_MEET) { - clusterNode *node; - - node = createClusterNode(NULL,CLUSTER_NODE_HANDSHAKE); - serverAssert(nodeIp2String(node->ip,link,hdr->myip) == C_OK); - getClientPortFromClusterMsg(hdr, &node->tls_port, &node->tcp_port); - node->cport = ntohs(hdr->cport); - clusterAddNode(node); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); - } - - /* If this is a MEET packet from an unknown node, we still process - * the gossip section here since we have to trust the sender because - * of the message type. 
*/ - if (!sender && type == CLUSTERMSG_TYPE_MEET) - clusterProcessGossipSection(hdr,link); - - /* Anyway reply with a PONG */ - clusterSendPing(link,CLUSTERMSG_TYPE_PONG); - } - - /* PING, PONG, MEET: process config information. */ - if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG || - type == CLUSTERMSG_TYPE_MEET) - { - serverLog(LL_DEBUG,"%s packet received: %.40s", - clusterGetMessageTypeString(type), - link->node ? link->node->name : "NULL"); - if (!link->inbound) { - if (nodeInHandshake(link->node)) { - /* If we already have this node, try to change the - * IP/port of the node with the new one. */ - if (sender) { - serverLog(LL_VERBOSE, - "Handshake: we already know node %.40s (%s), " - "updating the address if needed.", sender->name, sender->human_nodename); - if (nodeUpdateAddressIfNeeded(sender,link,hdr)) - { - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE); - } - /* Free this node as we already have it. This will - * cause the link to be freed as well. */ - clusterDelNode(link->node); - return 0; - } - - /* First thing to do is replacing the random name with the - * right node name if this was a handshake stage. */ - clusterRenameNode(link->node, hdr->sender); - serverLog(LL_DEBUG,"Handshake with node %.40s completed.", - link->node->name); - link->node->flags &= ~CLUSTER_NODE_HANDSHAKE; - link->node->flags |= flags&(CLUSTER_NODE_MASTER|CLUSTER_NODE_SLAVE); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); - } else if (memcmp(link->node->name,hdr->sender, - CLUSTER_NAMELEN) != 0) - { - /* If the reply has a non matching node ID we - * disconnect this node and set it as not having an associated - * address. */ - serverLog(LL_DEBUG,"PONG contains mismatching sender ID. 
About node %.40s added %d ms ago, having flags %d", - link->node->name, - (int)(now-(link->node->ctime)), - link->node->flags); - link->node->flags |= CLUSTER_NODE_NOADDR; - link->node->ip[0] = '\0'; - link->node->tcp_port = 0; - link->node->tls_port = 0; - link->node->cport = 0; - freeClusterLink(link); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); - return 0; - } - } - - /* Copy the CLUSTER_NODE_NOFAILOVER flag from what the sender - * announced. This is a dynamic flag that we receive from the - * sender, and the latest status must be trusted. We need it to - * be propagated because the slave ranking used to understand the - * delay of each slave in the voting process, needs to know - * what are the instances really competing. */ - if (sender) { - int nofailover = flags & CLUSTER_NODE_NOFAILOVER; - sender->flags &= ~CLUSTER_NODE_NOFAILOVER; - sender->flags |= nofailover; - } - - /* Update the node address if it changed. */ - if (sender && type == CLUSTERMSG_TYPE_PING && - !nodeInHandshake(sender) && - nodeUpdateAddressIfNeeded(sender,link,hdr)) - { - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE); - } - - /* Update our info about the node */ - if (!link->inbound && type == CLUSTERMSG_TYPE_PONG) { - link->node->pong_received = now; - link->node->ping_sent = 0; - - /* The PFAIL condition can be reversed without external - * help if it is momentary (that is, if it does not - * turn into a FAIL state). - * - * The FAIL condition is also reversible under specific - * conditions detected by clearNodeFailureIfNeeded(). */ - if (nodeTimedOut(link->node)) { - link->node->flags &= ~CLUSTER_NODE_PFAIL; - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE); - } else if (nodeFailed(link->node)) { - clearNodeFailureIfNeeded(link->node); - } - } - - /* Check for role switch: slave -> master or master -> slave. 
*/ - if (sender) { - if (!memcmp(hdr->slaveof,CLUSTER_NODE_NULL_NAME, - sizeof(hdr->slaveof))) - { - /* Node is a master. */ - clusterSetNodeAsMaster(sender); - } else { - /* Node is a slave. */ - clusterNode *master = clusterLookupNode(hdr->slaveof, CLUSTER_NAMELEN); - - if (nodeIsMaster(sender)) { - /* Master turned into a slave! Reconfigure the node. */ - clusterDelNodeSlots(sender); - sender->flags &= ~(CLUSTER_NODE_MASTER| - CLUSTER_NODE_MIGRATE_TO); - sender->flags |= CLUSTER_NODE_SLAVE; - - /* Update config and state. */ - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE); - } - - /* Master node changed for this slave? */ - if (master && sender->slaveof != master) { - if (sender->slaveof) - clusterNodeRemoveSlave(sender->slaveof,sender); - clusterNodeAddSlave(master,sender); - sender->slaveof = master; - - /* Update config. */ - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); - } - } - } - - /* Update our info about served slots. - * - * Note: this MUST happen after we update the master/slave state - * so that CLUSTER_NODE_MASTER flag will be set. */ - - /* Many checks are only needed if the set of served slots this - * instance claims is different compared to the set of slots we have - * for it. Check this ASAP to avoid other computational expansive - * checks later. */ - clusterNode *sender_master = NULL; /* Sender or its master if slave. */ - int dirty_slots = 0; /* Sender claimed slots don't match my view? */ - - if (sender) { - sender_master = nodeIsMaster(sender) ? sender : sender->slaveof; - if (sender_master) { - dirty_slots = memcmp(sender_master->slots, - hdr->myslots,sizeof(hdr->myslots)) != 0; - } - } - - /* 1) If the sender of the message is a master, and we detected that - * the set of slots it claims changed, scan the slots to see if we - * need to update our configuration. 
*/ - if (sender && nodeIsMaster(sender) && dirty_slots) - clusterUpdateSlotsConfigWith(sender,senderConfigEpoch,hdr->myslots); - - /* 2) We also check for the reverse condition, that is, the sender - * claims to serve slots we know are served by a master with a - * greater configEpoch. If this happens we inform the sender. - * - * This is useful because sometimes after a partition heals, a - * reappearing master may be the last one to claim a given set of - * hash slots, but with a configuration that other instances know to - * be deprecated. Example: - * - * A and B are master and slave for slots 1,2,3. - * A is partitioned away, B gets promoted. - * B is partitioned away, and A returns available. - * - * Usually B would PING A publishing its set of served slots and its - * configEpoch, but because of the partition B can't inform A of the - * new configuration, so other nodes that have an updated table must - * do it. In this way A will stop to act as a master (or can try to - * failover if there are the conditions to win the election). */ - if (sender && dirty_slots) { - int j; - - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (bitmapTestBit(hdr->myslots,j)) { - if (server.cluster->slots[j] == sender || - isSlotUnclaimed(j)) continue; - if (server.cluster->slots[j]->configEpoch > - senderConfigEpoch) - { - serverLog(LL_VERBOSE, - "Node %.40s has old slots configuration, sending " - "an UPDATE message about %.40s", - sender->name, server.cluster->slots[j]->name); - clusterSendUpdate(sender->link, - server.cluster->slots[j]); - - /* TODO: instead of exiting the loop send every other - * UPDATE packet for other nodes that are the new owner - * of sender's slots. */ - break; - } - } - } - } - - /* If our config epoch collides with the sender's try to fix - * the problem. 
*/ - if (sender && - nodeIsMaster(myself) && nodeIsMaster(sender) && - senderConfigEpoch == myself->configEpoch) - { - clusterHandleConfigEpochCollision(sender); - } - - /* Get info from the gossip section */ - if (sender) { - clusterProcessGossipSection(hdr,link); - clusterProcessPingExtensions(hdr,link); - } - } else if (type == CLUSTERMSG_TYPE_FAIL) { - clusterNode *failing; - - if (sender) { - failing = clusterLookupNode(hdr->data.fail.about.nodename, CLUSTER_NAMELEN); - if (failing && - !(failing->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_MYSELF))) - { - serverLog(LL_NOTICE, - "FAIL message received from %.40s (%s) about %.40s (%s)", - hdr->sender, sender->human_nodename, hdr->data.fail.about.nodename, failing->human_nodename); - failing->flags |= CLUSTER_NODE_FAIL; - failing->fail_time = now; - failing->flags &= ~CLUSTER_NODE_PFAIL; - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE); - } - } else { - serverLog(LL_NOTICE, - "Ignoring FAIL message from unknown node %.40s about %.40s", - hdr->sender, hdr->data.fail.about.nodename); - } - } else if (type == CLUSTERMSG_TYPE_PUBLISH || type == CLUSTERMSG_TYPE_PUBLISHSHARD) { - if (!sender) return 1; /* We don't know that node. */ - - robj *channel, *message; - uint32_t channel_len, message_len; - - /* Don't bother creating useless objects if there are no - * Pub/Sub subscribers. 
*/ - if ((type == CLUSTERMSG_TYPE_PUBLISH - && serverPubsubSubscriptionCount() > 0) - || (type == CLUSTERMSG_TYPE_PUBLISHSHARD - && serverPubsubShardSubscriptionCount() > 0)) - { - channel_len = ntohl(hdr->data.publish.msg.channel_len); - message_len = ntohl(hdr->data.publish.msg.message_len); - channel = createStringObject( - (char*)hdr->data.publish.msg.bulk_data,channel_len); - message = createStringObject( - (char*)hdr->data.publish.msg.bulk_data+channel_len, - message_len); - pubsubPublishMessage(channel, message, type == CLUSTERMSG_TYPE_PUBLISHSHARD); - decrRefCount(channel); - decrRefCount(message); - } - } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST) { - if (!sender) return 1; /* We don't know that node. */ - clusterSendFailoverAuthIfNeeded(sender,hdr); - } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK) { - if (!sender) return 1; /* We don't know that node. */ - /* We consider this vote only if the sender is a master serving - * a non zero number of slots, and its currentEpoch is greater or - * equal to epoch where this node started the election. */ - if (nodeIsMaster(sender) && sender->numslots > 0 && - senderCurrentEpoch >= server.cluster->failover_auth_epoch) - { - server.cluster->failover_auth_count++; - /* Maybe we reached a quorum here, set a flag to make sure - * we check ASAP. */ - clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); - } - } else if (type == CLUSTERMSG_TYPE_MFSTART) { - /* This message is acceptable only if I'm a master and the sender - * is one of my slaves. */ - if (!sender || sender->slaveof != myself) return 1; - /* Manual failover requested from slaves. Initialize the state - * accordingly. 
*/ - resetManualFailover(); - server.cluster->mf_end = now + CLUSTER_MF_TIMEOUT; - server.cluster->mf_slave = sender; - pauseActions(PAUSE_DURING_FAILOVER, - now + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT), - PAUSE_ACTIONS_CLIENT_WRITE_SET); - serverLog(LL_NOTICE,"Manual failover requested by replica %.40s (%s).", - sender->name, sender->human_nodename); - /* We need to send a ping message to the replica, as it would carry - * `server.cluster->mf_master_offset`, which means the master paused clients - * at offset `server.cluster->mf_master_offset`, so that the replica would - * know that it is safe to set its `server.cluster->mf_can_start` to 1 so as - * to complete failover as quickly as possible. */ - clusterSendPing(link, CLUSTERMSG_TYPE_PING); - } else if (type == CLUSTERMSG_TYPE_UPDATE) { - clusterNode *n; /* The node the update is about. */ - uint64_t reportedConfigEpoch = - ntohu64(hdr->data.update.nodecfg.configEpoch); - - if (!sender) return 1; /* We don't know the sender. */ - n = clusterLookupNode(hdr->data.update.nodecfg.nodename, CLUSTER_NAMELEN); - if (!n) return 1; /* We don't know the reported node. */ - if (n->configEpoch >= reportedConfigEpoch) return 1; /* Nothing new. */ - - /* If in our current config the node is a slave, set it as a master. */ - if (nodeIsSlave(n)) clusterSetNodeAsMaster(n); - - /* Update the node's configEpoch. */ - n->configEpoch = reportedConfigEpoch; - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_FSYNC_CONFIG); - - /* Check the bitmap of served slots and update our - * config accordingly. */ - clusterUpdateSlotsConfigWith(n,reportedConfigEpoch, - hdr->data.update.nodecfg.slots); - } else if (type == CLUSTERMSG_TYPE_MODULE) { - if (!sender) return 1; /* Protect the module from unknown nodes. */ - /* We need to route this message back to the right module subscribed - * for the right message type. 
*/ - uint64_t module_id = hdr->data.module.msg.module_id; /* Endian-safe ID */ - uint32_t len = ntohl(hdr->data.module.msg.len); - uint8_t type = hdr->data.module.msg.type; - unsigned char *payload = hdr->data.module.msg.bulk_data; - moduleCallClusterReceivers(sender->name,module_id,type,payload,len); - } else { - serverLog(LL_WARNING,"Received unknown packet type: %d", type); - } - return 1; -} - -/* This function is called when we detect the link with this node is lost. - We set the node as no longer connected. The Cluster Cron will detect - this connection and will try to get it connected again. - - Instead if the node is a temporary node used to accept a query, we - completely free the node on error. */ -void handleLinkIOError(clusterLink *link) { - freeClusterLink(link); -} - -/* Send the messages queued for the link. */ -void clusterWriteHandler(connection *conn) { - clusterLink *link = connGetPrivateData(conn); - ssize_t nwritten; - size_t totwritten = 0; - - while (totwritten < NET_MAX_WRITES_PER_EVENT && listLength(link->send_msg_queue) > 0) { - listNode *head = listFirst(link->send_msg_queue); - clusterMsgSendBlock *msgblock = (clusterMsgSendBlock*)head->value; - clusterMsg *msg = &msgblock->msg; - size_t msg_offset = link->head_msg_send_offset; - size_t msg_len = ntohl(msg->totlen); - - nwritten = connWrite(conn, (char*)msg + msg_offset, msg_len - msg_offset); - if (nwritten <= 0) { - serverLog(LL_DEBUG,"I/O error writing to node link: %s", - (nwritten == -1) ? 
connGetLastError(conn) : "short write"); - handleLinkIOError(link); - return; - } - if (msg_offset + nwritten < msg_len) { - /* If full message wasn't written, record the offset - * and continue sending from this point next time */ - link->head_msg_send_offset += nwritten; - return; - } - serverAssert((msg_offset + nwritten) == msg_len); - link->head_msg_send_offset = 0; - - /* Delete the node and update our memory tracking */ - uint32_t blocklen = msgblock->totlen; - listDelNode(link->send_msg_queue, head); - server.stat_cluster_links_memory -= sizeof(listNode); - link->send_msg_queue_mem -= sizeof(listNode) + blocklen; - - totwritten += nwritten; - } - - if (listLength(link->send_msg_queue) == 0) - connSetWriteHandler(link->conn, NULL); -} - -/* A connect handler that gets called when a connection to another node - * gets established. - */ -void clusterLinkConnectHandler(connection *conn) { - clusterLink *link = connGetPrivateData(conn); - clusterNode *node = link->node; - - /* Check if connection succeeded */ - if (connGetState(conn) != CONN_STATE_CONNECTED) { - serverLog(LL_VERBOSE, "Connection with Node %.40s at %s:%d failed: %s", - node->name, node->ip, node->cport, - connGetLastError(conn)); - freeClusterLink(link); - return; - } - - /* Register a read handler from now on */ - connSetReadHandler(conn, clusterReadHandler); - - /* Queue a PING in the new connection ASAP: this is crucial - * to avoid false positives in failure detection. - * - * If the node is flagged as MEET, we send a MEET message instead - * of a PING one, to force the receiver to add us in its node - * table. */ - mstime_t old_ping_sent = node->ping_sent; - clusterSendPing(link, node->flags & CLUSTER_NODE_MEET ? - CLUSTERMSG_TYPE_MEET : CLUSTERMSG_TYPE_PING); - if (old_ping_sent) { - /* If there was an active ping before the link was - * disconnected, we want to restore the ping time, otherwise - * replaced by the clusterSendPing() call. 
*/ - node->ping_sent = old_ping_sent; - } - /* We can clear the flag after the first packet is sent. - * If we'll never receive a PONG, we'll never send new packets - * to this node. Instead after the PONG is received and we - * are no longer in meet/handshake status, we want to send - * normal PING packets. */ - node->flags &= ~CLUSTER_NODE_MEET; - - serverLog(LL_DEBUG,"Connecting with Node %.40s at %s:%d", - node->name, node->ip, node->cport); -} - -/* Read data. Try to read the first field of the header first to check the - * full length of the packet. When a whole packet is in memory this function - * will call the function to process the packet. And so forth. */ -void clusterReadHandler(connection *conn) { - clusterMsg buf[1]; - ssize_t nread; - clusterMsg *hdr; - clusterLink *link = connGetPrivateData(conn); - unsigned int readlen, rcvbuflen; - - while(1) { /* Read as long as there is data to read. */ - rcvbuflen = link->rcvbuf_len; - if (rcvbuflen < 8) { - /* First, obtain the first 8 bytes to get the full message - * length. */ - readlen = 8 - rcvbuflen; - } else { - /* Finally read the full message. */ - hdr = (clusterMsg*) link->rcvbuf; - if (rcvbuflen == 8) { - /* Perform some sanity check on the message signature - * and length. */ - if (memcmp(hdr->sig,"RCmb",4) != 0 || - ntohl(hdr->totlen) < CLUSTERMSG_MIN_LEN) - { - char ip[NET_IP_STR_LEN]; - int port; - if (connAddrPeerName(conn, ip, sizeof(ip), &port) == -1) { - serverLog(LL_WARNING, - "Bad message length or signature received " - "on the Cluster bus."); - } else { - serverLog(LL_WARNING, - "Bad message length or signature received " - "on the Cluster bus from %s:%d", ip, port); - } - handleLinkIOError(link); - return; - } - } - readlen = ntohl(hdr->totlen) - rcvbuflen; - if (readlen > sizeof(buf)) readlen = sizeof(buf); - } - - nread = connRead(conn,buf,readlen); - if (nread == -1 && (connGetState(conn) == CONN_STATE_CONNECTED)) return; /* No more data ready. 
*/ - - if (nread <= 0) { - /* I/O error... */ - serverLog(LL_DEBUG,"I/O error reading from node link: %s", - (nread == 0) ? "connection closed" : connGetLastError(conn)); - handleLinkIOError(link); - return; - } else { - /* Read data and recast the pointer to the new buffer. */ - size_t unused = link->rcvbuf_alloc - link->rcvbuf_len; - if ((size_t)nread > unused) { - size_t required = link->rcvbuf_len + nread; - size_t prev_rcvbuf_alloc = link->rcvbuf_alloc; - /* If less than 1mb, grow to twice the needed size, if larger grow by 1mb. */ - link->rcvbuf_alloc = required < RCVBUF_MAX_PREALLOC ? required * 2: required + RCVBUF_MAX_PREALLOC; - link->rcvbuf = zrealloc(link->rcvbuf, link->rcvbuf_alloc); - server.stat_cluster_links_memory += link->rcvbuf_alloc - prev_rcvbuf_alloc; - } - memcpy(link->rcvbuf + link->rcvbuf_len, buf, nread); - link->rcvbuf_len += nread; - hdr = (clusterMsg*) link->rcvbuf; - rcvbuflen += nread; - } - - /* Total length obtained? Process this packet. */ - if (rcvbuflen >= 8 && rcvbuflen == ntohl(hdr->totlen)) { - if (clusterProcessPacket(link)) { - if (link->rcvbuf_alloc > RCVBUF_INIT_LEN) { - size_t prev_rcvbuf_alloc = link->rcvbuf_alloc; - zfree(link->rcvbuf); - link->rcvbuf = zmalloc(link->rcvbuf_alloc = RCVBUF_INIT_LEN); - server.stat_cluster_links_memory += link->rcvbuf_alloc - prev_rcvbuf_alloc; - } - link->rcvbuf_len = 0; - } else { - return; /* Link no longer valid. */ - } - } - } -} - -/* Put the message block into the link's send queue. - * - * It is guaranteed that this function will never have as a side effect - * the link to be invalidated, so it is safe to call this function - * from event handlers that will do stuff with the same link later. 
*/ -void clusterSendMessage(clusterLink *link, clusterMsgSendBlock *msgblock) { - if (!link) { - return; - } - if (listLength(link->send_msg_queue) == 0 && msgblock->msg.totlen != 0) - connSetWriteHandlerWithBarrier(link->conn, clusterWriteHandler, 1); - - listAddNodeTail(link->send_msg_queue, msgblock); - msgblock->refcount++; - - /* Update memory tracking */ - link->send_msg_queue_mem += sizeof(listNode) + msgblock->totlen; - server.stat_cluster_links_memory += sizeof(listNode); - - /* Populate sent messages stats. */ - uint16_t type = ntohs(msgblock->msg.type); - if (type < CLUSTERMSG_TYPE_COUNT) - server.cluster->stats_bus_messages_sent[type]++; -} - -/* Send a message to all the nodes that are part of the cluster having - * a connected link. - * - * It is guaranteed that this function will never have as a side effect - * some node->link to be invalidated, so it is safe to call this function - * from event handlers that will do stuff with node links later. */ -void clusterBroadcastMessage(clusterMsgSendBlock *msgblock) { - dictIterator *di; - dictEntry *de; - - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - - if (node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE)) - continue; - clusterSendMessage(node->link,msgblock); - } - dictReleaseIterator(di); -} - -/* Build the message header. hdr must point to a buffer at least - * sizeof(clusterMsg) in bytes. */ -static void clusterBuildMessageHdr(clusterMsg *hdr, int type, size_t msglen) { - uint64_t offset; - clusterNode *master; - - /* If this node is a master, we send its slots bitmap and configEpoch. - * If this node is a slave we send the master's information instead (the - * node is flagged as slave so the receiver knows that it is NOT really - * in charge for this slots. */ - master = (nodeIsSlave(myself) && myself->slaveof) ? 
- myself->slaveof : myself; - - hdr->ver = htons(CLUSTER_PROTO_VER); - hdr->sig[0] = 'R'; - hdr->sig[1] = 'C'; - hdr->sig[2] = 'm'; - hdr->sig[3] = 'b'; - hdr->type = htons(type); - memcpy(hdr->sender,myself->name,CLUSTER_NAMELEN); - - /* If cluster-announce-ip option is enabled, force the receivers of our - * packets to use the specified address for this node. Otherwise if the - * first byte is zero, they'll do auto discovery. */ - memset(hdr->myip,0,NET_IP_STR_LEN); - if (server.cluster_announce_ip) { - redis_strlcpy(hdr->myip,server.cluster_announce_ip,NET_IP_STR_LEN); - } - - /* Handle cluster-announce-[tls-|bus-]port. */ - int announced_tcp_port, announced_tls_port, announced_cport; - deriveAnnouncedPorts(&announced_tcp_port, &announced_tls_port, &announced_cport); - - memcpy(hdr->myslots,master->slots,sizeof(hdr->myslots)); - memset(hdr->slaveof,0,CLUSTER_NAMELEN); - if (myself->slaveof != NULL) - memcpy(hdr->slaveof,myself->slaveof->name, CLUSTER_NAMELEN); - if (server.tls_cluster) { - hdr->port = htons(announced_tls_port); - hdr->pport = htons(announced_tcp_port); - } else { - hdr->port = htons(announced_tcp_port); - hdr->pport = htons(announced_tls_port); - } - hdr->cport = htons(announced_cport); - hdr->flags = htons(myself->flags); - hdr->state = server.cluster->state; - - /* Set the currentEpoch and configEpochs. */ - hdr->currentEpoch = htonu64(server.cluster->currentEpoch); - hdr->configEpoch = htonu64(master->configEpoch); - - /* Set the replication offset. */ - if (nodeIsSlave(myself)) - offset = replicationGetSlaveOffset(); - else - offset = server.master_repl_offset; - hdr->offset = htonu64(offset); - - /* Set the message flags. */ - if (nodeIsMaster(myself) && server.cluster->mf_end) - hdr->mflags[0] |= CLUSTERMSG_FLAG0_PAUSED; - - hdr->totlen = htonl(msglen); -} - -/* Set the i-th entry of the gossip section in the message pointed by 'hdr' - * to the info of the specified node 'n'. 
*/ -void clusterSetGossipEntry(clusterMsg *hdr, int i, clusterNode *n) { - clusterMsgDataGossip *gossip; - gossip = &(hdr->data.ping.gossip[i]); - memcpy(gossip->nodename,n->name,CLUSTER_NAMELEN); - gossip->ping_sent = htonl(n->ping_sent/1000); - gossip->pong_received = htonl(n->pong_received/1000); - memcpy(gossip->ip,n->ip,sizeof(n->ip)); - if (server.tls_cluster) { - gossip->port = htons(n->tls_port); - gossip->pport = htons(n->tcp_port); - } else { - gossip->port = htons(n->tcp_port); - gossip->pport = htons(n->tls_port); - } - gossip->cport = htons(n->cport); - gossip->flags = htons(n->flags); - gossip->notused1 = 0; -} - -/* Send a PING or PONG packet to the specified node, making sure to add enough - * gossip information. */ -void clusterSendPing(clusterLink *link, int type) { - static unsigned long long cluster_pings_sent = 0; - cluster_pings_sent++; - int gossipcount = 0; /* Number of gossip sections added so far. */ - int wanted; /* Number of gossip sections we want to append if possible. */ - int estlen; /* Upper bound on estimated packet length */ - /* freshnodes is the max number of nodes we can hope to append at all: - * nodes available minus two (ourself and the node we are sending the - * message to). However practically there may be less valid nodes since - * nodes in handshake state, disconnected, are not considered. */ - int freshnodes = dictSize(server.cluster->nodes)-2; - - /* How many gossip sections we want to add? 1/10 of the number of nodes - * and anyway at least 3. Why 1/10? - * - * If we have N masters, with N/10 entries, and we consider that in - * node_timeout we exchange with each other node at least 4 packets - * (we ping in the worst case in node_timeout/2 time, and we also - * receive two pings from the host), we have a total of 8 packets - * in the node_timeout*2 failure reports validity time. 
So we have - * that, for a single PFAIL node, we can expect to receive the following - * number of failure reports (in the specified window of time): - * - * PROB * GOSSIP_ENTRIES_PER_PACKET * TOTAL_PACKETS: - * - * PROB = probability of being featured in a single gossip entry, - * which is 1 / NUM_OF_NODES. - * ENTRIES = 10. - * TOTAL_PACKETS = 2 * 4 * NUM_OF_MASTERS. - * - * If we assume we have just masters (so num of nodes and num of masters - * is the same), with 1/10 we always get over the majority, and specifically - * 80% of the number of nodes, to account for many masters failing at the - * same time. - * - * Since we have non-voting slaves that lower the probability of an entry - * to feature our node, we set the number of entries per packet as - * 10% of the total nodes we have. */ - wanted = floor(dictSize(server.cluster->nodes)/10); - if (wanted < 3) wanted = 3; - if (wanted > freshnodes) wanted = freshnodes; - - /* Include all the nodes in PFAIL state, so that failure reports are - * faster to propagate to go from PFAIL to FAIL state. */ - int pfail_wanted = server.cluster->stats_pfail_nodes; - - /* Compute the maximum estlen to allocate our buffer. We'll fix the estlen - * later according to the number of gossip sections we really were able - * to put inside the packet. */ - estlen = sizeof(clusterMsg) - sizeof(union clusterMsgData); - estlen += (sizeof(clusterMsgDataGossip)*(wanted + pfail_wanted)); - estlen += writePingExt(NULL, 0); - /* Note: clusterBuildMessageHdr() expects the buffer to be always at least - * sizeof(clusterMsg) or more. 
*/ - if (estlen < (int)sizeof(clusterMsg)) estlen = sizeof(clusterMsg); - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, estlen); - clusterMsg *hdr = &msgblock->msg; - - if (!link->inbound && type == CLUSTERMSG_TYPE_PING) - link->node->ping_sent = mstime(); - - /* Populate the gossip fields */ - int maxiterations = wanted*3; - while(freshnodes > 0 && gossipcount < wanted && maxiterations--) { - dictEntry *de = dictGetRandomKey(server.cluster->nodes); - clusterNode *this = dictGetVal(de); - - /* Don't include this node: the whole packet header is about us - * already, so we just gossip about other nodes. */ - if (this == myself) continue; - - /* PFAIL nodes will be added later. */ - if (this->flags & CLUSTER_NODE_PFAIL) continue; - - /* In the gossip section don't include: - * 1) Nodes in HANDSHAKE state. - * 3) Nodes with the NOADDR flag set. - * 4) Disconnected nodes if they don't have configured slots. - */ - if (this->flags & (CLUSTER_NODE_HANDSHAKE|CLUSTER_NODE_NOADDR) || - (this->link == NULL && this->numslots == 0)) - { - freshnodes--; /* Technically not correct, but saves CPU. */ - continue; - } - - /* Do not add a node we already have. */ - if (this->last_in_ping_gossip == cluster_pings_sent) continue; - - /* Add it */ - clusterSetGossipEntry(hdr,gossipcount,this); - this->last_in_ping_gossip = cluster_pings_sent; - freshnodes--; - gossipcount++; - } - - /* If there are PFAIL nodes, add them at the end. 
*/ - if (pfail_wanted) { - dictIterator *di; - dictEntry *de; - - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL && pfail_wanted > 0) { - clusterNode *node = dictGetVal(de); - if (node->flags & CLUSTER_NODE_HANDSHAKE) continue; - if (node->flags & CLUSTER_NODE_NOADDR) continue; - if (!(node->flags & CLUSTER_NODE_PFAIL)) continue; - clusterSetGossipEntry(hdr,gossipcount,node); - gossipcount++; - /* We take the count of the slots we allocated, since the - * PFAIL stats may not match perfectly with the current number - * of PFAIL nodes. */ - pfail_wanted--; - } - dictReleaseIterator(di); - } - - /* Compute the actual total length and send! */ - uint32_t totlen = 0; - totlen += writePingExt(hdr, gossipcount); - totlen += sizeof(clusterMsg)-sizeof(union clusterMsgData); - totlen += (sizeof(clusterMsgDataGossip)*gossipcount); - serverAssert(gossipcount < USHRT_MAX); - hdr->count = htons(gossipcount); - hdr->totlen = htonl(totlen); - - clusterSendMessage(link,msgblock); - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* Send a PONG packet to every connected node that's not in handshake state - * and for which we have a valid link. - * - * In Redis Cluster pongs are not used just for failure detection, but also - * to carry important configuration information. So broadcasting a pong is - * useful when something changes in the configuration and we want to make - * the cluster aware ASAP (for instance after a slave promotion). - * - * The 'target' argument specifies the receiving instances using the - * defines below: - * - * CLUSTER_BROADCAST_ALL -> All known instances. - * CLUSTER_BROADCAST_LOCAL_SLAVES -> All slaves in my master-slaves ring. 
- */ -#define CLUSTER_BROADCAST_ALL 0 -#define CLUSTER_BROADCAST_LOCAL_SLAVES 1 -void clusterBroadcastPong(int target) { - dictIterator *di; - dictEntry *de; - - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - - if (!node->link) continue; - if (node == myself || nodeInHandshake(node)) continue; - if (target == CLUSTER_BROADCAST_LOCAL_SLAVES) { - int local_slave = - nodeIsSlave(node) && node->slaveof && - (node->slaveof == myself || node->slaveof == myself->slaveof); - if (!local_slave) continue; - } - clusterSendPing(node->link,CLUSTERMSG_TYPE_PONG); - } - dictReleaseIterator(di); -} - -/* Create a PUBLISH message block. - * - * Sanitizer suppression: In clusterMsgDataPublish, sizeof(bulk_data) is 8. - * As all the struct is used as a buffer, when more than 8 bytes are copied into - * the 'bulk_data', sanitizer generates an out-of-bounds error which is a false - * positive in this context. */ -REDIS_NO_SANITIZE("bounds") -clusterMsgSendBlock *clusterCreatePublishMsgBlock(robj *channel, robj *message, uint16_t type) { - - uint32_t channel_len, message_len; - - channel = getDecodedObject(channel); - message = getDecodedObject(message); - channel_len = sdslen(channel->ptr); - message_len = sdslen(message->ptr); - - size_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - msglen += sizeof(clusterMsgDataPublish) - 8 + channel_len + message_len; - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, msglen); - - clusterMsg *hdr = &msgblock->msg; - hdr->data.publish.msg.channel_len = htonl(channel_len); - hdr->data.publish.msg.message_len = htonl(message_len); - memcpy(hdr->data.publish.msg.bulk_data,channel->ptr,sdslen(channel->ptr)); - memcpy(hdr->data.publish.msg.bulk_data+sdslen(channel->ptr), - message->ptr,sdslen(message->ptr)); - - decrRefCount(channel); - decrRefCount(message); - - return msgblock; -} - -/* Send a FAIL message to all the nodes we are able to 
contact. - * The FAIL message is sent when we detect that a node is failing - * (CLUSTER_NODE_PFAIL) and we also receive a gossip confirmation of this: - * we switch the node state to CLUSTER_NODE_FAIL and ask all the other - * nodes to do the same ASAP. */ -void clusterSendFail(char *nodename) { - uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) - + sizeof(clusterMsgDataFail); - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAIL, msglen); - - clusterMsg *hdr = &msgblock->msg; - memcpy(hdr->data.fail.about.nodename,nodename,CLUSTER_NAMELEN); - - clusterBroadcastMessage(msgblock); - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* Send an UPDATE message to the specified link carrying the specified 'node' - * slots configuration. The node name, slots bitmap, and configEpoch info - * are included. */ -void clusterSendUpdate(clusterLink *link, clusterNode *node) { - if (link == NULL) return; - - uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) - + sizeof(clusterMsgDataUpdate); - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_UPDATE, msglen); - - clusterMsg *hdr = &msgblock->msg; - memcpy(hdr->data.update.nodecfg.nodename,node->name,CLUSTER_NAMELEN); - hdr->data.update.nodecfg.configEpoch = htonu64(node->configEpoch); - memcpy(hdr->data.update.nodecfg.slots,node->slots,sizeof(node->slots)); - for (unsigned int i = 0; i < sizeof(node->slots); i++) { - /* Don't advertise slots that the node stopped claiming */ - hdr->data.update.nodecfg.slots[i] = hdr->data.update.nodecfg.slots[i] & (~server.cluster->owner_not_claiming_slot[i]); - } - - clusterSendMessage(link,msgblock); - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* Send a MODULE message. - * - * If link is NULL, then the message is broadcasted to the whole cluster. 
*/ -void clusterSendModule(clusterLink *link, uint64_t module_id, uint8_t type, - const char *payload, uint32_t len) { - uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - msglen += sizeof(clusterMsgModule) - 3 + len; - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MODULE, msglen); - - clusterMsg *hdr = &msgblock->msg; - hdr->data.module.msg.module_id = module_id; /* Already endian adjusted. */ - hdr->data.module.msg.type = type; - hdr->data.module.msg.len = htonl(len); - memcpy(hdr->data.module.msg.bulk_data,payload,len); - - if (link) - clusterSendMessage(link,msgblock); - else - clusterBroadcastMessage(msgblock); - - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* This function gets a cluster node ID string as target, the same way the nodes - * addresses are represented in the modules side, resolves the node, and sends - * the message. If the target is NULL the message is broadcasted. - * - * The function returns C_OK if the target is valid, otherwise C_ERR is - * returned. */ -int clusterSendModuleMessageToTarget(const char *target, uint64_t module_id, uint8_t type, const char *payload, uint32_t len) { - clusterNode *node = NULL; - - if (target != NULL) { - node = clusterLookupNode(target, strlen(target)); - if (node == NULL || node->link == NULL) return C_ERR; - } - - clusterSendModule(target ? node->link : NULL, - module_id, type, payload, len); - return C_OK; -} - -/* ----------------------------------------------------------------------------- - * CLUSTER Pub/Sub support - * - * If `sharded` is 0: - * For now we do very little, just propagating [S]PUBLISH messages across the whole - * cluster. In the future we'll try to get smarter and avoiding propagating those - * messages to hosts without receives for a given channel. - * Otherwise: - * Publish this message across the slot (primary/replica). 
- * -------------------------------------------------------------------------- */ -void clusterPropagatePublish(robj *channel, robj *message, int sharded) { - clusterMsgSendBlock *msgblock; - - if (!sharded) { - msgblock = clusterCreatePublishMsgBlock(channel, message, CLUSTERMSG_TYPE_PUBLISH); - clusterBroadcastMessage(msgblock); - clusterMsgSendBlockDecrRefCount(msgblock); - return; - } - - listIter li; - listNode *ln; - list *nodes_for_slot = clusterGetNodesInMyShard(server.cluster->myself); - serverAssert(nodes_for_slot != NULL); - listRewind(nodes_for_slot, &li); - msgblock = clusterCreatePublishMsgBlock(channel, message, CLUSTERMSG_TYPE_PUBLISHSHARD); - while((ln = listNext(&li))) { - clusterNode *node = listNodeValue(ln); - if (node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE)) - continue; - clusterSendMessage(node->link,msgblock); - } - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* ----------------------------------------------------------------------------- - * SLAVE node specific functions - * -------------------------------------------------------------------------- */ - -/* This function sends a FAILOVER_AUTH_REQUEST message to every node in order to - * see if there is the quorum for this slave instance to failover its failing - * master. - * - * Note that we send the failover request to everybody, master and slave nodes, - * but only the masters are supposed to reply to our query. */ -void clusterRequestFailoverAuth(void) { - uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST, msglen); - - clusterMsg *hdr = &msgblock->msg; - /* If this is a manual failover, set the CLUSTERMSG_FLAG0_FORCEACK bit - * in the header to communicate the nodes receiving the message that - * they should authorized the failover even if the master is working. 
*/ - if (server.cluster->mf_end) hdr->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK; - clusterBroadcastMessage(msgblock); - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* Send a FAILOVER_AUTH_ACK message to the specified node. */ -void clusterSendFailoverAuth(clusterNode *node) { - if (!node->link) return; - - uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK, msglen); - - clusterSendMessage(node->link,msgblock); - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* Send a MFSTART message to the specified node. */ -void clusterSendMFStart(clusterNode *node) { - if (!node->link) return; - - uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MFSTART, msglen); - - clusterSendMessage(node->link,msgblock); - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* Vote for the node asking for our vote if there are the conditions. */ -void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { - clusterNode *master = node->slaveof; - uint64_t requestCurrentEpoch = ntohu64(request->currentEpoch); - uint64_t requestConfigEpoch = ntohu64(request->configEpoch); - unsigned char *claimed_slots = request->myslots; - int force_ack = request->mflags[0] & CLUSTERMSG_FLAG0_FORCEACK; - int j; - - /* IF we are not a master serving at least 1 slot, we don't have the - * right to vote, as the cluster size in Redis Cluster is the number - * of masters serving at least one slot, and quorum is the cluster - * size + 1 */ - if (nodeIsSlave(myself) || myself->numslots == 0) return; - - /* Request epoch must be >= our currentEpoch. - * Note that it is impossible for it to actually be greater since - * our currentEpoch was updated as a side effect of receiving this - * request, if the request epoch was greater. 
*/ - if (requestCurrentEpoch < server.cluster->currentEpoch) { - serverLog(LL_WARNING, - "Failover auth denied to %.40s (%s): reqEpoch (%llu) < curEpoch(%llu)", - node->name, node->human_nodename, - (unsigned long long) requestCurrentEpoch, - (unsigned long long) server.cluster->currentEpoch); - return; - } - - /* I already voted for this epoch? Return ASAP. */ - if (server.cluster->lastVoteEpoch == server.cluster->currentEpoch) { - serverLog(LL_WARNING, - "Failover auth denied to %.40s (%s): already voted for epoch %llu", - node->name, node->human_nodename, - (unsigned long long) server.cluster->currentEpoch); - return; - } - - /* Node must be a slave and its master down. - * The master can be non failing if the request is flagged - * with CLUSTERMSG_FLAG0_FORCEACK (manual failover). */ - if (nodeIsMaster(node) || master == NULL || - (!nodeFailed(master) && !force_ack)) - { - if (nodeIsMaster(node)) { - serverLog(LL_WARNING, - "Failover auth denied to %.40s (%s): it is a master node", - node->name, node->human_nodename); - } else if (master == NULL) { - serverLog(LL_WARNING, - "Failover auth denied to %.40s (%s): I don't know its master", - node->name, node->human_nodename); - } else if (!nodeFailed(master)) { - serverLog(LL_WARNING, - "Failover auth denied to %.40s (%s): its master is up", - node->name, node->human_nodename); - } - return; - } - - /* We did not voted for a slave about this master for two - * times the node timeout. This is not strictly needed for correctness - * of the algorithm but makes the base case more linear. 
*/ - if (mstime() - node->slaveof->voted_time < server.cluster_node_timeout * 2) - { - serverLog(LL_WARNING, - "Failover auth denied to %.40s %s: " - "can't vote about this master before %lld milliseconds", - node->name, node->human_nodename, - (long long) ((server.cluster_node_timeout*2)- - (mstime() - node->slaveof->voted_time))); - return; - } - - /* The slave requesting the vote must have a configEpoch for the claimed - * slots that is >= the one of the masters currently serving the same - * slots in the current configuration. */ - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (bitmapTestBit(claimed_slots, j) == 0) continue; - if (isSlotUnclaimed(j) || - server.cluster->slots[j]->configEpoch <= requestConfigEpoch) - { - continue; - } - /* If we reached this point we found a slot that in our current slots - * is served by a master with a greater configEpoch than the one claimed - * by the slave requesting our vote. Refuse to vote for this slave. */ - serverLog(LL_WARNING, - "Failover auth denied to %.40s (%s): " - "slot %d epoch (%llu) > reqEpoch (%llu)", - node->name, node->human_nodename, j, - (unsigned long long) server.cluster->slots[j]->configEpoch, - (unsigned long long) requestConfigEpoch); - return; - } - - /* We can vote for this slave. */ - server.cluster->lastVoteEpoch = server.cluster->currentEpoch; - node->slaveof->voted_time = mstime(); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_FSYNC_CONFIG); - clusterSendFailoverAuth(node); - serverLog(LL_NOTICE, "Failover auth granted to %.40s (%s) for epoch %llu", - node->name, node->human_nodename, (unsigned long long) server.cluster->currentEpoch); -} - -/* This function returns the "rank" of this instance, a slave, in the context - * of its master-slaves ring. The rank of the slave is given by the number of - * other slaves for the same master that have a better replication offset - * compared to the local one (better means, greater, so they claim more data). 
- * - * A slave with rank 0 is the one with the greatest (most up to date) - * replication offset, and so forth. Note that because how the rank is computed - * multiple slaves may have the same rank, in case they have the same offset. - * - * The slave rank is used to add a delay to start an election in order to - * get voted and replace a failing master. Slaves with better replication - * offsets are more likely to win. */ -int clusterGetSlaveRank(void) { - long long myoffset; - int j, rank = 0; - clusterNode *master; - - serverAssert(nodeIsSlave(myself)); - master = myself->slaveof; - if (master == NULL) return 0; /* Never called by slaves without master. */ - - myoffset = replicationGetSlaveOffset(); - for (j = 0; j < master->numslaves; j++) - if (master->slaves[j] != myself && - !nodeCantFailover(master->slaves[j]) && - master->slaves[j]->repl_offset > myoffset) rank++; - return rank; -} - -/* This function is called by clusterHandleSlaveFailover() in order to - * let the slave log why it is not able to failover. Sometimes there are - * not the conditions, but since the failover function is called again and - * again, we can't log the same things continuously. - * - * This function works by logging only if a given set of conditions are - * true: - * - * 1) The reason for which the failover can't be initiated changed. - * The reasons also include a NONE reason we reset the state to - * when the slave finds that its master is fine (no FAIL flag). - * 2) Also, the log is emitted again if the master is still down and - * the reason for not failing over is still the same, but more than - * CLUSTER_CANT_FAILOVER_RELOG_PERIOD seconds elapsed. - * 3) Finally, the function only logs if the slave is down for more than - * five seconds + NODE_TIMEOUT. This way nothing is logged when a - * failover starts in a reasonable time. - * - * The function is called with the reason why the slave can't failover - * which is one of the integer macros CLUSTER_CANT_FAILOVER_*. 
- * - * The function is guaranteed to be called only if 'myself' is a slave. */ -void clusterLogCantFailover(int reason) { - char *msg; - static time_t lastlog_time = 0; - mstime_t nolog_fail_time = server.cluster_node_timeout + 5000; - - /* Don't log if we have the same reason for some time. */ - if (reason == server.cluster->cant_failover_reason && - time(NULL)-lastlog_time < CLUSTER_CANT_FAILOVER_RELOG_PERIOD) - return; - - server.cluster->cant_failover_reason = reason; - - /* We also don't emit any log if the master failed no long ago, the - * goal of this function is to log slaves in a stalled condition for - * a long time. */ - if (myself->slaveof && - nodeFailed(myself->slaveof) && - (mstime() - myself->slaveof->fail_time) < nolog_fail_time) return; - - switch(reason) { - case CLUSTER_CANT_FAILOVER_DATA_AGE: - msg = "Disconnected from master for longer than allowed. " - "Please check the 'cluster-replica-validity-factor' configuration " - "option."; - break; - case CLUSTER_CANT_FAILOVER_WAITING_DELAY: - msg = "Waiting the delay before I can start a new failover."; - break; - case CLUSTER_CANT_FAILOVER_EXPIRED: - msg = "Failover attempt expired."; - break; - case CLUSTER_CANT_FAILOVER_WAITING_VOTES: - msg = "Waiting for votes, but majority still not reached."; - break; - default: - msg = "Unknown reason code."; - break; - } - lastlog_time = time(NULL); - serverLog(LL_NOTICE,"Currently unable to failover: %s", msg); - - int cur_vote = server.cluster->failover_auth_count; - int cur_quorum = (server.cluster->size / 2) + 1; - /* Emits a log when an election is in progress and waiting for votes or when the failover attempt expired. */ - if (reason == CLUSTER_CANT_FAILOVER_WAITING_VOTES || reason == CLUSTER_CANT_FAILOVER_EXPIRED) { - serverLog(LL_NOTICE, "Needed quorum: %d. 
Number of votes received so far: %d", cur_quorum, cur_vote); - } -} - -/* This function implements the final part of automatic and manual failovers, - * where the slave grabs its master's hash slots, and propagates the new - * configuration. - * - * Note that it's up to the caller to be sure that the node got a new - * configuration epoch already. */ -void clusterFailoverReplaceYourMaster(void) { - int j; - clusterNode *oldmaster = myself->slaveof; - - if (nodeIsMaster(myself) || oldmaster == NULL) return; - - /* 1) Turn this node into a master. */ - clusterSetNodeAsMaster(myself); - replicationUnsetMaster(); - - /* 2) Claim all the slots assigned to our master. */ - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (clusterNodeGetSlotBit(oldmaster,j)) { - clusterDelSlot(j); - clusterAddSlot(myself,j); - } - } - - /* 3) Update state and save config. */ - clusterUpdateState(); - clusterSaveConfigOrDie(1); - - /* 4) Pong all the other nodes so that they can update the state - * accordingly and detect that we switched to master role. */ - clusterBroadcastPong(CLUSTER_BROADCAST_ALL); - - /* 5) If there was a manual failover in progress, clear the state. */ - resetManualFailover(); -} - -/* This function is called if we are a slave node and our master serving - * a non-zero amount of hash slots is in FAIL state. - * - * The goal of this function is: - * 1) To check if we are able to perform a failover, is our data updated? - * 2) Try to get elected by masters. - * 3) Perform the failover informing all the other nodes. 
- */ -void clusterHandleSlaveFailover(void) { - mstime_t data_age; - mstime_t auth_age = mstime() - server.cluster->failover_auth_time; - int needed_quorum = (server.cluster->size / 2) + 1; - int manual_failover = server.cluster->mf_end != 0 && - server.cluster->mf_can_start; - mstime_t auth_timeout, auth_retry_time; - - server.cluster->todo_before_sleep &= ~CLUSTER_TODO_HANDLE_FAILOVER; - - /* Compute the failover timeout (the max time we have to send votes - * and wait for replies), and the failover retry time (the time to wait - * before trying to get voted again). - * - * Timeout is MAX(NODE_TIMEOUT*2,2000) milliseconds. - * Retry is two times the Timeout. - */ - auth_timeout = server.cluster_node_timeout*2; - if (auth_timeout < 2000) auth_timeout = 2000; - auth_retry_time = auth_timeout*2; - - /* Pre conditions to run the function, that must be met both in case - * of an automatic or manual failover: - * 1) We are a slave. - * 2) Our master is flagged as FAIL, or this is a manual failover. - * 3) We don't have the no failover configuration set, and this is - * not a manual failover. - * 4) It is serving slots. */ - if (nodeIsMaster(myself) || - myself->slaveof == NULL || - (!nodeFailed(myself->slaveof) && !manual_failover) || - (server.cluster_slave_no_failover && !manual_failover) || - myself->slaveof->numslots == 0) - { - /* There are no reasons to failover, so we set the reason why we - * are returning without failing over to NONE. */ - server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE; - return; - } - - /* Set data_age to the number of milliseconds we are disconnected from - * the master. 
*/ - if (server.repl_state == REPL_STATE_CONNECTED) { - data_age = (mstime_t)(server.unixtime - server.master->lastinteraction) - * 1000; - } else { - data_age = (mstime_t)(server.unixtime - server.repl_down_since) * 1000; - } - - /* Remove the node timeout from the data age as it is fine that we are - * disconnected from our master at least for the time it was down to be - * flagged as FAIL, that's the baseline. */ - if (data_age > server.cluster_node_timeout) - data_age -= server.cluster_node_timeout; - - /* Check if our data is recent enough according to the slave validity - * factor configured by the user. - * - * Check bypassed for manual failovers. */ - if (server.cluster_slave_validity_factor && - data_age > - (((mstime_t)server.repl_ping_slave_period * 1000) + - (server.cluster_node_timeout * server.cluster_slave_validity_factor))) - { - if (!manual_failover) { - clusterLogCantFailover(CLUSTER_CANT_FAILOVER_DATA_AGE); - return; - } - } - - /* If the previous failover attempt timeout and the retry time has - * elapsed, we can setup a new one. */ - if (auth_age > auth_retry_time) { - server.cluster->failover_auth_time = mstime() + - 500 + /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */ - random() % 500; /* Random delay between 0 and 500 milliseconds. */ - server.cluster->failover_auth_count = 0; - server.cluster->failover_auth_sent = 0; - server.cluster->failover_auth_rank = clusterGetSlaveRank(); - /* We add another delay that is proportional to the slave rank. - * Specifically 1 second * rank. This way slaves that have a probably - * less updated replication offset, are penalized. */ - server.cluster->failover_auth_time += - server.cluster->failover_auth_rank * 1000; - /* However if this is a manual failover, no delay is needed. 
*/ - if (server.cluster->mf_end) { - server.cluster->failover_auth_time = mstime(); - server.cluster->failover_auth_rank = 0; - clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); - } - serverLog(LL_NOTICE, - "Start of election delayed for %lld milliseconds " - "(rank #%d, offset %lld).", - server.cluster->failover_auth_time - mstime(), - server.cluster->failover_auth_rank, - replicationGetSlaveOffset()); - /* Now that we have a scheduled election, broadcast our offset - * to all the other slaves so that they'll updated their offsets - * if our offset is better. */ - clusterBroadcastPong(CLUSTER_BROADCAST_LOCAL_SLAVES); - return; - } - - /* It is possible that we received more updated offsets from other - * slaves for the same master since we computed our election delay. - * Update the delay if our rank changed. - * - * Not performed if this is a manual failover. */ - if (server.cluster->failover_auth_sent == 0 && - server.cluster->mf_end == 0) - { - int newrank = clusterGetSlaveRank(); - if (newrank > server.cluster->failover_auth_rank) { - long long added_delay = - (newrank - server.cluster->failover_auth_rank) * 1000; - server.cluster->failover_auth_time += added_delay; - server.cluster->failover_auth_rank = newrank; - serverLog(LL_NOTICE, - "Replica rank updated to #%d, added %lld milliseconds of delay.", - newrank, added_delay); - } - } - - /* Return ASAP if we can't still start the election. */ - if (mstime() < server.cluster->failover_auth_time) { - clusterLogCantFailover(CLUSTER_CANT_FAILOVER_WAITING_DELAY); - return; - } - - /* Return ASAP if the election is too old to be valid. */ - if (auth_age > auth_timeout) { - clusterLogCantFailover(CLUSTER_CANT_FAILOVER_EXPIRED); - return; - } - - /* Ask for votes if needed. 
*/ - if (server.cluster->failover_auth_sent == 0) { - server.cluster->currentEpoch++; - server.cluster->failover_auth_epoch = server.cluster->currentEpoch; - serverLog(LL_NOTICE,"Starting a failover election for epoch %llu.", - (unsigned long long) server.cluster->currentEpoch); - clusterRequestFailoverAuth(); - server.cluster->failover_auth_sent = 1; - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_FSYNC_CONFIG); - return; /* Wait for replies. */ - } - - /* Check if we reached the quorum. */ - if (server.cluster->failover_auth_count >= needed_quorum) { - /* We have the quorum, we can finally failover the master. */ - - serverLog(LL_NOTICE, - "Failover election won: I'm the new master."); - - /* Update my configEpoch to the epoch of the election. */ - if (myself->configEpoch < server.cluster->failover_auth_epoch) { - myself->configEpoch = server.cluster->failover_auth_epoch; - serverLog(LL_NOTICE, - "configEpoch set to %llu after successful failover", - (unsigned long long) myself->configEpoch); - } - - /* Take responsibility for the cluster slots. */ - clusterFailoverReplaceYourMaster(); - } else { - clusterLogCantFailover(CLUSTER_CANT_FAILOVER_WAITING_VOTES); - } -} - -/* ----------------------------------------------------------------------------- - * CLUSTER slave migration - * - * Slave migration is the process that allows a slave of a master that is - * already covered by at least another slave, to "migrate" to a master that - * is orphaned, that is, left with no working slaves. - * ------------------------------------------------------------------------- */ - -/* This function is responsible to decide if this replica should be migrated - * to a different (orphaned) master. It is called by the clusterCron() function - * only if: - * - * 1) We are a slave node. - * 2) It was detected that there is at least one orphaned master in - * the cluster. 
- * 3) We are a slave of one of the masters with the greatest number of - * slaves. - * - * This checks are performed by the caller since it requires to iterate - * the nodes anyway, so we spend time into clusterHandleSlaveMigration() - * if definitely needed. - * - * The function is called with a pre-computed max_slaves, that is the max - * number of working (not in FAIL state) slaves for a single master. - * - * Additional conditions for migration are examined inside the function. - */ -void clusterHandleSlaveMigration(int max_slaves) { - int j, okslaves = 0; - clusterNode *mymaster = myself->slaveof, *target = NULL, *candidate = NULL; - dictIterator *di; - dictEntry *de; - - /* Step 1: Don't migrate if the cluster state is not ok. */ - if (server.cluster->state != CLUSTER_OK) return; - - /* Step 2: Don't migrate if my master will not be left with at least - * 'migration-barrier' slaves after my migration. */ - if (mymaster == NULL) return; - for (j = 0; j < mymaster->numslaves; j++) - if (!nodeFailed(mymaster->slaves[j]) && - !nodeTimedOut(mymaster->slaves[j])) okslaves++; - if (okslaves <= server.cluster_migration_barrier) return; - - /* Step 3: Identify a candidate for migration, and check if among the - * masters with the greatest number of ok slaves, I'm the one with the - * smallest node ID (the "candidate slave"). - * - * Note: this means that eventually a replica migration will occur - * since slaves that are reachable again always have their FAIL flag - * cleared, so eventually there must be a candidate. - * There is a possible race condition causing multiple - * slaves to migrate at the same time, but this is unlikely to - * happen and relatively harmless when it does. 
*/ - candidate = myself; - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - int okslaves = 0, is_orphaned = 1; - - /* We want to migrate only if this master is working, orphaned, and - * used to have slaves or if failed over a master that had slaves - * (MIGRATE_TO flag). This way we only migrate to instances that were - * supposed to have replicas. */ - if (nodeIsSlave(node) || nodeFailed(node)) is_orphaned = 0; - if (!(node->flags & CLUSTER_NODE_MIGRATE_TO)) is_orphaned = 0; - - /* Check number of working slaves. */ - if (nodeIsMaster(node)) okslaves = clusterCountNonFailingSlaves(node); - if (okslaves > 0) is_orphaned = 0; - - if (is_orphaned) { - if (!target && node->numslots > 0) target = node; - - /* Track the starting time of the orphaned condition for this - * master. */ - if (!node->orphaned_time) node->orphaned_time = mstime(); - } else { - node->orphaned_time = 0; - } - - /* Check if I'm the slave candidate for the migration: attached - * to a master with the maximum number of slaves and with the smallest - * node ID. */ - if (okslaves == max_slaves) { - for (j = 0; j < node->numslaves; j++) { - if (memcmp(node->slaves[j]->name, - candidate->name, - CLUSTER_NAMELEN) < 0) - { - candidate = node->slaves[j]; - } - } - } - } - dictReleaseIterator(di); - - /* Step 4: perform the migration if there is a target, and if I'm the - * candidate, but only if the master is continuously orphaned for a - * couple of seconds, so that during failovers, we give some time to - * the natural slaves of this instance to advertise their switch from - * the old master to the new one. 
*/ - if (target && candidate == myself && - (mstime()-target->orphaned_time) > CLUSTER_SLAVE_MIGRATION_DELAY && - !(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER)) - { - serverLog(LL_NOTICE,"Migrating to orphaned master %.40s", - target->name); - clusterSetMaster(target); - } -} - -/* ----------------------------------------------------------------------------- - * CLUSTER manual failover - * - * This are the important steps performed by slaves during a manual failover: - * 1) User send CLUSTER FAILOVER command. The failover state is initialized - * setting mf_end to the millisecond unix time at which we'll abort the - * attempt. - * 2) Slave sends a MFSTART message to the master requesting to pause clients - * for two times the manual failover timeout CLUSTER_MF_TIMEOUT. - * When master is paused for manual failover, it also starts to flag - * packets with CLUSTERMSG_FLAG0_PAUSED. - * 3) Slave waits for master to send its replication offset flagged as PAUSED. - * 4) If slave received the offset from the master, and its offset matches, - * mf_can_start is set to 1, and clusterHandleSlaveFailover() will perform - * the failover as usually, with the difference that the vote request - * will be modified to force masters to vote for a slave that has a - * working master. - * - * From the point of view of the master things are simpler: when a - * PAUSE_CLIENTS packet is received the master sets mf_end as well and - * the sender in mf_slave. During the time limit for the manual failover - * the master will just send PINGs more often to this slave, flagged with - * the PAUSED flag, so that the slave will set mf_master_offset when receiving - * a packet from the master with this flag set. - * - * The goal of the manual failover is to perform a fast failover without - * data loss due to the asynchronous master-slave replication. - * -------------------------------------------------------------------------- */ - -/* Reset the manual failover state. 
This works for both masters and slaves - * as all the state about manual failover is cleared. - * - * The function can be used both to initialize the manual failover state at - * startup or to abort a manual failover in progress. */ -void resetManualFailover(void) { - if (server.cluster->mf_slave) { - /* We were a master failing over, so we paused clients and related actions. - * Regardless of the outcome we unpause now to allow traffic again. */ - unpauseActions(PAUSE_DURING_FAILOVER); - } - server.cluster->mf_end = 0; /* No manual failover in progress. */ - server.cluster->mf_can_start = 0; - server.cluster->mf_slave = NULL; - server.cluster->mf_master_offset = -1; -} - -/* If a manual failover timed out, abort it. */ -void manualFailoverCheckTimeout(void) { - if (server.cluster->mf_end && server.cluster->mf_end < mstime()) { - serverLog(LL_WARNING,"Manual failover timed out."); - resetManualFailover(); - } -} - -/* This function is called from the cluster cron function in order to go - * forward with a manual failover state machine. */ -void clusterHandleManualFailover(void) { - /* Return ASAP if no manual failover is in progress. */ - if (server.cluster->mf_end == 0) return; - - /* If mf_can_start is non-zero, the failover was already triggered so the - * next steps are performed by clusterHandleSlaveFailover(). */ - if (server.cluster->mf_can_start) return; - - if (server.cluster->mf_master_offset == -1) return; /* Wait for offset... */ - - if (server.cluster->mf_master_offset == replicationGetSlaveOffset()) { - /* Our replication offset matches the master replication offset - * announced after clients were paused. We can start the failover. 
*/ - server.cluster->mf_can_start = 1; - serverLog(LL_NOTICE, - "All master replication stream processed, " - "manual failover can start."); - clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); - return; - } - clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER); -} - -/* ----------------------------------------------------------------------------- - * CLUSTER cron job - * -------------------------------------------------------------------------- */ - -/* Check if the node is disconnected and re-establish the connection. - * Also update a few stats while we are here, that can be used to make - * better decisions in other part of the code. */ -static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t handshake_timeout, mstime_t now) { - /* Not interested in reconnecting the link with myself or nodes - * for which we have no address. */ - if (node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_NOADDR)) return 1; - - if (node->flags & CLUSTER_NODE_PFAIL) - server.cluster->stats_pfail_nodes++; - - /* A Node in HANDSHAKE state has a limited lifespan equal to the - * configured node timeout. */ - if (nodeInHandshake(node) && now - node->ctime > handshake_timeout) { - clusterDelNode(node); - return 1; - } - - if (node->link == NULL) { - clusterLink *link = createClusterLink(node); - link->conn = connCreate(connTypeOfCluster()); - connSetPrivateData(link->conn, link); - if (connConnect(link->conn, node->ip, node->cport, server.bind_source_addr, - clusterLinkConnectHandler) == C_ERR) { - /* We got a synchronous error from connect before - * clusterSendPing() had a chance to be called. - * If node->ping_sent is zero, failure detection can't work, - * so we claim we actually sent a ping now (that will - * be really sent as soon as the link is obtained). 
*/ - if (node->ping_sent == 0) node->ping_sent = mstime(); - serverLog(LL_DEBUG, "Unable to connect to " - "Cluster Node [%s]:%d -> %s", node->ip, - node->cport, server.neterr); - - freeClusterLink(link); - return 0; - } - } - return 0; -} - -static void freeClusterLinkOnBufferLimitReached(clusterLink *link) { - if (link == NULL || server.cluster_link_msg_queue_limit_bytes == 0) { - return; - } - - unsigned long long mem_link = link->send_msg_queue_mem; - if (mem_link > server.cluster_link_msg_queue_limit_bytes) { - serverLog(LL_WARNING, "Freeing cluster link(%s node %.40s, used memory: %llu) due to " - "exceeding send buffer memory limit.", link->inbound ? "from" : "to", - link->node ? link->node->name : "", mem_link); - freeClusterLink(link); - server.cluster->stat_cluster_links_buffer_limit_exceeded++; - } -} - -/* Free outbound link to a node if its send buffer size exceeded limit. */ -static void clusterNodeCronFreeLinkOnBufferLimitReached(clusterNode *node) { - freeClusterLinkOnBufferLimitReached(node->link); - freeClusterLinkOnBufferLimitReached(node->inbound_link); -} - -/* This is executed 10 times every second */ -void clusterCron(void) { - dictIterator *di; - dictEntry *de; - int update_state = 0; - int orphaned_masters; /* How many masters there are without ok slaves. */ - int max_slaves; /* Max number of ok slaves for a single master. */ - int this_slaves; /* Number of ok slaves for our master (if we are slave). */ - mstime_t min_pong = 0, now = mstime(); - clusterNode *min_pong_node = NULL; - static unsigned long long iteration = 0; - mstime_t handshake_timeout; - - iteration++; /* Number of times this function was called so far. */ - - clusterUpdateMyselfHostname(); - - /* The handshake timeout is the time after which a handshake node that was - * not turned into a normal node is removed from the nodes. Usually it is - * just the NODE_TIMEOUT value, but when NODE_TIMEOUT is too small we use - * the value of 1 second. 
*/ - handshake_timeout = server.cluster_node_timeout; - if (handshake_timeout < 1000) handshake_timeout = 1000; - - /* Clear so clusterNodeCronHandleReconnect can count the number of nodes in PFAIL. */ - server.cluster->stats_pfail_nodes = 0; - /* Run through some of the operations we want to do on each cluster node. */ - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - /* We free the inbound or outboud link to the node if the link has an - * oversized message send queue and immediately try reconnecting. */ - clusterNodeCronFreeLinkOnBufferLimitReached(node); - /* The protocol is that function(s) below return non-zero if the node was - * terminated. - */ - if(clusterNodeCronHandleReconnect(node, handshake_timeout, now)) continue; - } - dictReleaseIterator(di); - - /* Ping some random node 1 time every 10 iterations, so that we usually ping - * one random node every second. */ - if (!(iteration % 10)) { - int j; - - /* Check a few random nodes and ping the one with the oldest - * pong_received time. */ - for (j = 0; j < 5; j++) { - de = dictGetRandomKey(server.cluster->nodes); - clusterNode *this = dictGetVal(de); - - /* Don't ping nodes disconnected or with a ping currently active. */ - if (this->link == NULL || this->ping_sent != 0) continue; - if (this->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE)) - continue; - if (min_pong_node == NULL || min_pong > this->pong_received) { - min_pong_node = this; - min_pong = this->pong_received; - } - } - if (min_pong_node) { - serverLog(LL_DEBUG,"Pinging node %.40s", min_pong_node->name); - clusterSendPing(min_pong_node->link, CLUSTERMSG_TYPE_PING); - } - } - - /* Iterate nodes to check if we need to flag something as failing. - * This loop is also responsible to: - * 1) Check if there are orphaned masters (masters without non failing - * slaves). - * 2) Count the max number of non failing slaves for a single master. 
- * 3) Count the number of slaves for our master, if we are a slave. */ - orphaned_masters = 0; - max_slaves = 0; - this_slaves = 0; - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - now = mstime(); /* Use an updated time at every iteration. */ - - if (node->flags & - (CLUSTER_NODE_MYSELF|CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE)) - continue; - - /* Orphaned master check, useful only if the current instance - * is a slave that may migrate to another master. */ - if (nodeIsSlave(myself) && nodeIsMaster(node) && !nodeFailed(node)) { - int okslaves = clusterCountNonFailingSlaves(node); - - /* A master is orphaned if it is serving a non-zero number of - * slots, have no working slaves, but used to have at least one - * slave, or failed over a master that used to have slaves. */ - if (okslaves == 0 && node->numslots > 0 && - node->flags & CLUSTER_NODE_MIGRATE_TO) - { - orphaned_masters++; - } - if (okslaves > max_slaves) max_slaves = okslaves; - if (myself->slaveof == node) - this_slaves = okslaves; - } - - /* If we are not receiving any data for more than half the cluster - * timeout, reconnect the link: maybe there is a connection - * issue even if the node is alive. */ - mstime_t ping_delay = now - node->ping_sent; - mstime_t data_delay = now - node->data_received; - if (node->link && /* is connected */ - now - node->link->ctime > - server.cluster_node_timeout && /* was not already reconnected */ - node->ping_sent && /* we already sent a ping */ - /* and we are waiting for the pong more than timeout/2 */ - ping_delay > server.cluster_node_timeout/2 && - /* and in such interval we are not seeing any traffic at all. */ - data_delay > server.cluster_node_timeout/2) - { - /* Disconnect the link, it will be reconnected automatically. 
*/ - freeClusterLink(node->link); - } - - /* If we have currently no active ping in this instance, and the - * received PONG is older than half the cluster timeout, send - * a new ping now, to ensure all the nodes are pinged without - * a too big delay. */ - mstime_t ping_interval = server.cluster_ping_interval ? - server.cluster_ping_interval : server.cluster_node_timeout/2; - if (node->link && - node->ping_sent == 0 && - (now - node->pong_received) > ping_interval) - { - clusterSendPing(node->link, CLUSTERMSG_TYPE_PING); - continue; - } - - /* If we are a master and one of the slaves requested a manual - * failover, ping it continuously. */ - if (server.cluster->mf_end && - nodeIsMaster(myself) && - server.cluster->mf_slave == node && - node->link) - { - clusterSendPing(node->link, CLUSTERMSG_TYPE_PING); - continue; - } - - /* Check only if we have an active ping for this instance. */ - if (node->ping_sent == 0) continue; - - /* Check if this node looks unreachable. - * Note that if we already received the PONG, then node->ping_sent - * is zero, so can't reach this code at all, so we don't risk of - * checking for a PONG delay if we didn't sent the PING. - * - * We also consider every incoming data as proof of liveness, since - * our cluster bus link is also used for data: under heavy data - * load pong delays are possible. */ - mstime_t node_delay = (ping_delay < data_delay) ? ping_delay : - data_delay; - - if (node_delay > server.cluster_node_timeout) { - /* Timeout reached. Set the node as possibly failing if it is - * not already in this state. */ - if (!(node->flags & (CLUSTER_NODE_PFAIL|CLUSTER_NODE_FAIL))) { - serverLog(LL_DEBUG,"*** NODE %.40s possibly failing", - node->name); - node->flags |= CLUSTER_NODE_PFAIL; - update_state = 1; - } - } - } - dictReleaseIterator(di); - - /* If we are a slave node but the replication is still turned off, - * enable it if we know the address of our master and it appears to - * be up. 
*/ - if (nodeIsSlave(myself) && - server.masterhost == NULL && - myself->slaveof && - nodeHasAddr(myself->slaveof)) - { - replicationSetMaster(myself->slaveof->ip, getNodeDefaultReplicationPort(myself->slaveof)); - } - - /* Abort a manual failover if the timeout is reached. */ - manualFailoverCheckTimeout(); - - if (nodeIsSlave(myself)) { - clusterHandleManualFailover(); - if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER)) - clusterHandleSlaveFailover(); - /* If there are orphaned slaves, and we are a slave among the masters - * with the max number of non-failing slaves, consider migrating to - * the orphaned masters. Note that it does not make sense to try - * a migration if there is no master with at least *two* working - * slaves. */ - if (orphaned_masters && max_slaves >= 2 && this_slaves == max_slaves && - server.cluster_allow_replica_migration) - clusterHandleSlaveMigration(max_slaves); - } - - if (update_state || server.cluster->state == CLUSTER_FAIL) - clusterUpdateState(); -} - -/* This function is called before the event handler returns to sleep for - * events. It is useful to perform operations that must be done ASAP in - * reaction to events fired but that are not safe to perform inside event - * handlers, or to perform potentially expansive tasks that we need to do - * a single time before replying to clients. */ -void clusterBeforeSleep(void) { - int flags = server.cluster->todo_before_sleep; - - /* Reset our flags (not strictly needed since every single function - * called for flags set should be able to clear its flag). 
*/ - server.cluster->todo_before_sleep = 0; - - if (flags & CLUSTER_TODO_HANDLE_MANUALFAILOVER) { - /* Handle manual failover as soon as possible so that won't have a 100ms - * as it was handled only in clusterCron */ - if(nodeIsSlave(myself)) { - clusterHandleManualFailover(); - if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER)) - clusterHandleSlaveFailover(); - } - } else if (flags & CLUSTER_TODO_HANDLE_FAILOVER) { - /* Handle failover, this is needed when it is likely that there is already - * the quorum from masters in order to react fast. */ - clusterHandleSlaveFailover(); - } - - /* Update the cluster state. */ - if (flags & CLUSTER_TODO_UPDATE_STATE) - clusterUpdateState(); - - /* Save the config, possibly using fsync. */ - if (flags & CLUSTER_TODO_SAVE_CONFIG) { - int fsync = flags & CLUSTER_TODO_FSYNC_CONFIG; - clusterSaveConfigOrDie(fsync); - } -} - -void clusterDoBeforeSleep(int flags) { - server.cluster->todo_before_sleep |= flags; -} - -/* ----------------------------------------------------------------------------- - * Slots management - * -------------------------------------------------------------------------- */ - -/* Test bit 'pos' in a generic bitmap. Return 1 if the bit is set, - * otherwise 0. */ -int bitmapTestBit(unsigned char *bitmap, int pos) { - off_t byte = pos/8; - int bit = pos&7; - return (bitmap[byte] & (1<nodes); - dictEntry *de; - int slaves = 0; - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - - if (nodeIsSlave(node)) continue; - slaves += node->numslaves; - } - dictReleaseIterator(di); - return slaves != 0; -} - -/* Set the slot bit and return the old value. 
*/ -int clusterNodeSetSlotBit(clusterNode *n, int slot) { - int old = bitmapTestBit(n->slots,slot); - if (!old) { - bitmapSetBit(n->slots,slot); - n->numslots++; - /* When a master gets its first slot, even if it has no slaves, - * it gets flagged with MIGRATE_TO, that is, the master is a valid - * target for replicas migration, if and only if at least one of - * the other masters has slaves right now. - * - * Normally masters are valid targets of replica migration if: - * 1. The used to have slaves (but no longer have). - * 2. They are slaves failing over a master that used to have slaves. - * - * However new masters with slots assigned are considered valid - * migration targets if the rest of the cluster is not a slave-less. - * - * See https://github.com/redis/redis/issues/3043 for more info. */ - if (n->numslots == 1 && clusterMastersHaveSlaves()) - n->flags |= CLUSTER_NODE_MIGRATE_TO; - } - return old; -} - -/* Clear the slot bit and return the old value. */ -int clusterNodeClearSlotBit(clusterNode *n, int slot) { - int old = bitmapTestBit(n->slots,slot); - if (old) { - bitmapClearBit(n->slots,slot); - n->numslots--; - } - return old; -} - -/* Return the slot bit from the cluster node structure. */ -int clusterNodeGetSlotBit(clusterNode *n, int slot) { - return bitmapTestBit(n->slots,slot); -} - -/* Add the specified slot to the list of slots that node 'n' will - * serve. Return C_OK if the operation ended with success. - * If the slot is already assigned to another instance this is considered - * an error and C_ERR is returned. */ -int clusterAddSlot(clusterNode *n, int slot) { - if (server.cluster->slots[slot]) return C_ERR; - clusterNodeSetSlotBit(n,slot); - server.cluster->slots[slot] = n; - return C_OK; -} - -/* Delete the specified slot marking it as unassigned. - * Returns C_OK if the slot was assigned, otherwise if the slot was - * already unassigned C_ERR is returned. 
*/ -int clusterDelSlot(int slot) { - clusterNode *n = server.cluster->slots[slot]; - - if (!n) return C_ERR; - - /* Cleanup the channels in master/replica as part of slot deletion. */ - removeChannelsInSlot(slot); - /* Clear the slot bit. */ - serverAssert(clusterNodeClearSlotBit(n,slot) == 1); - server.cluster->slots[slot] = NULL; - /* Make owner_not_claiming_slot flag consistent with slot ownership information. */ - bitmapClearBit(server.cluster->owner_not_claiming_slot, slot); - return C_OK; -} - -/* Delete all the slots associated with the specified node. - * The number of deleted slots is returned. */ -int clusterDelNodeSlots(clusterNode *node) { - int deleted = 0, j; - - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (clusterNodeGetSlotBit(node,j)) { - clusterDelSlot(j); - deleted++; - } - } - return deleted; -} - -/* Clear the migrating / importing state for all the slots. - * This is useful at initialization and when turning a master into slave. */ -void clusterCloseAllSlots(void) { - memset(server.cluster->migrating_slots_to,0, - sizeof(server.cluster->migrating_slots_to)); - memset(server.cluster->importing_slots_from,0, - sizeof(server.cluster->importing_slots_from)); -} - -/* ----------------------------------------------------------------------------- - * Cluster state evaluation function - * -------------------------------------------------------------------------- */ - -/* The following are defines that are only used in the evaluation function - * and are based on heuristics. Actually the main point about the rejoin and - * writable delay is that they should be a few orders of magnitude larger - * than the network latency. 
*/ -#define CLUSTER_MAX_REJOIN_DELAY 5000 -#define CLUSTER_MIN_REJOIN_DELAY 500 -#define CLUSTER_WRITABLE_DELAY 2000 - -void clusterUpdateState(void) { - int j, new_state; - int reachable_masters = 0; - static mstime_t among_minority_time; - static mstime_t first_call_time = 0; - - server.cluster->todo_before_sleep &= ~CLUSTER_TODO_UPDATE_STATE; - - /* If this is a master node, wait some time before turning the state - * into OK, since it is not a good idea to rejoin the cluster as a writable - * master, after a reboot, without giving the cluster a chance to - * reconfigure this node. Note that the delay is calculated starting from - * the first call to this function and not since the server start, in order - * to not count the DB loading time. */ - if (first_call_time == 0) first_call_time = mstime(); - if (nodeIsMaster(myself) && - server.cluster->state == CLUSTER_FAIL && - mstime() - first_call_time < CLUSTER_WRITABLE_DELAY) return; - - /* Start assuming the state is OK. We'll turn it into FAIL if there - * are the right conditions. */ - new_state = CLUSTER_OK; - - /* Check if all the slots are covered. */ - if (server.cluster_require_full_coverage) { - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (server.cluster->slots[j] == NULL || - server.cluster->slots[j]->flags & (CLUSTER_NODE_FAIL)) - { - new_state = CLUSTER_FAIL; - break; - } - } - } - - /* Compute the cluster size, that is the number of master nodes - * serving at least a single slot. - * - * At the same time count the number of reachable masters having - * at least one slot. 
*/ - { - dictIterator *di; - dictEntry *de; - - server.cluster->size = 0; - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - - if (nodeIsMaster(node) && node->numslots) { - server.cluster->size++; - if ((node->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) == 0) - reachable_masters++; - } - } - dictReleaseIterator(di); - } - - /* If we are in a minority partition, change the cluster state - * to FAIL. */ - { - int needed_quorum = (server.cluster->size / 2) + 1; - - if (reachable_masters < needed_quorum) { - new_state = CLUSTER_FAIL; - among_minority_time = mstime(); - } - } - - /* Log a state change */ - if (new_state != server.cluster->state) { - mstime_t rejoin_delay = server.cluster_node_timeout; - - /* If the instance is a master and was partitioned away with the - * minority, don't let it accept queries for some time after the - * partition heals, to make sure there is enough time to receive - * a configuration update. */ - if (rejoin_delay > CLUSTER_MAX_REJOIN_DELAY) - rejoin_delay = CLUSTER_MAX_REJOIN_DELAY; - if (rejoin_delay < CLUSTER_MIN_REJOIN_DELAY) - rejoin_delay = CLUSTER_MIN_REJOIN_DELAY; - - if (new_state == CLUSTER_OK && - nodeIsMaster(myself) && - mstime() - among_minority_time < rejoin_delay) - { - return; - } - - /* Change the state and log the event. */ - serverLog(new_state == CLUSTER_OK ? LL_NOTICE : LL_WARNING, - "Cluster state changed: %s", - new_state == CLUSTER_OK ? "ok" : "fail"); - server.cluster->state = new_state; - } -} - -/* This function is called after the node startup in order to verify that data - * loaded from disk is in agreement with the cluster configuration: - * - * 1) If we find keys about hash slots we have no responsibility for, the - * following happens: - * A) If no other node is in charge according to the current cluster - * configuration, we add these slots to our node. 
- * B) If according to our config other nodes are already in charge for - * this slots, we set the slots as IMPORTING from our point of view - * in order to justify we have those slots, and in order to make - * redis-cli aware of the issue, so that it can try to fix it. - * 2) If we find data in a DB different than DB0 we return C_ERR to - * signal the caller it should quit the server with an error message - * or take other actions. - * - * The function always returns C_OK even if it will try to correct - * the error described in "1". However if data is found in DB different - * from DB0, C_ERR is returned. - * - * The function also uses the logging facility in order to warn the user - * about desynchronizations between the data we have in memory and the - * cluster configuration. */ -int verifyClusterConfigWithData(void) { - int j; - int update_config = 0; - - /* Return ASAP if a module disabled cluster redirections. In that case - * every master can store keys about every possible hash slot. */ - if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION) - return C_OK; - - /* If this node is a slave, don't perform the check at all as we - * completely depend on the replication stream. */ - if (nodeIsSlave(myself)) return C_OK; - - /* Make sure we only have keys in DB0. */ - for (j = 1; j < server.dbnum; j++) { - if (dbSize(&server.db[j], DB_MAIN)) return C_ERR; - } - - /* Check that all the slots we see populated memory have a corresponding - * entry in the cluster table. Otherwise fix the table. */ - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (!countKeysInSlot(j)) continue; /* No keys in this slot. */ - /* Check if we are assigned to this slot or if we are importing it. - * In both cases check the next slot as the configuration makes - * sense. 
*/ - if (server.cluster->slots[j] == myself || - server.cluster->importing_slots_from[j] != NULL) continue; - - /* If we are here data and cluster config don't agree, and we have - * slot 'j' populated even if we are not importing it, nor we are - * assigned to this slot. Fix this condition. */ - - update_config++; - /* Case A: slot is unassigned. Take responsibility for it. */ - if (server.cluster->slots[j] == NULL) { - serverLog(LL_NOTICE, "I have keys for unassigned slot %d. " - "Taking responsibility for it.",j); - clusterAddSlot(myself,j); - } else { - serverLog(LL_NOTICE, "I have keys for slot %d, but the slot is " - "assigned to another node. " - "Setting it to importing state.",j); - server.cluster->importing_slots_from[j] = server.cluster->slots[j]; - } - } - if (update_config) clusterSaveConfigOrDie(1); - return C_OK; -} - -/* Remove all the shard channel related information not owned by the current shard. */ -static inline void removeAllNotOwnedShardChannelSubscriptions(void) { - if (!dictSize(server.pubsubshard_channels)) return; - clusterNode *currmaster = nodeIsMaster(myself) ? myself : myself->slaveof; - for (int j = 0; j < CLUSTER_SLOTS; j++) { - if (server.cluster->slots[j] != currmaster) { - removeChannelsInSlot(j); - } - } -} - -/* ----------------------------------------------------------------------------- - * SLAVE nodes handling - * -------------------------------------------------------------------------- */ - -/* Set the specified node 'n' as master for this node. - * If this node is currently a master, it is turned into a slave. 
*/ -void clusterSetMaster(clusterNode *n) { - serverAssert(n != myself); - serverAssert(myself->numslots == 0); - - if (nodeIsMaster(myself)) { - myself->flags &= ~(CLUSTER_NODE_MASTER|CLUSTER_NODE_MIGRATE_TO); - myself->flags |= CLUSTER_NODE_SLAVE; - clusterCloseAllSlots(); - } else { - if (myself->slaveof) - clusterNodeRemoveSlave(myself->slaveof,myself); - } - myself->slaveof = n; - updateShardId(myself, n->shard_id); - clusterNodeAddSlave(n,myself); - replicationSetMaster(n->ip, getNodeDefaultReplicationPort(n)); - removeAllNotOwnedShardChannelSubscriptions(); - resetManualFailover(); -} - -/* ----------------------------------------------------------------------------- - * Nodes to string representation functions. - * -------------------------------------------------------------------------- */ - -struct redisNodeFlags { - uint16_t flag; - char *name; -}; - -static struct redisNodeFlags redisNodeFlagsTable[] = { - {CLUSTER_NODE_MYSELF, "myself,"}, - {CLUSTER_NODE_MASTER, "master,"}, - {CLUSTER_NODE_SLAVE, "slave,"}, - {CLUSTER_NODE_PFAIL, "fail?,"}, - {CLUSTER_NODE_FAIL, "fail,"}, - {CLUSTER_NODE_HANDSHAKE, "handshake,"}, - {CLUSTER_NODE_NOADDR, "noaddr,"}, - {CLUSTER_NODE_NOFAILOVER, "nofailover,"} -}; - -/* Concatenate the comma separated list of node flags to the given SDS - * string 'ci'. */ -sds representClusterNodeFlags(sds ci, uint16_t flags) { - size_t orig_len = sdslen(ci); - int i, size = sizeof(redisNodeFlagsTable)/sizeof(struct redisNodeFlags); - for (i = 0; i < size; i++) { - struct redisNodeFlags *nodeflag = redisNodeFlagsTable + i; - if (flags & nodeflag->flag) ci = sdscat(ci, nodeflag->name); - } - /* If no flag was added, add the "noflags" special flag. */ - if (sdslen(ci) == orig_len) ci = sdscat(ci,"noflags,"); - sdsIncrLen(ci,-1); /* Remove trailing comma. */ - return ci; -} - -/* Concatenate the slot ownership information to the given SDS string 'ci'. 
- * If the slot ownership is in a contiguous block, it's represented as start-end pair, - * else each slot is added separately. */ -sds representSlotInfo(sds ci, uint16_t *slot_info_pairs, int slot_info_pairs_count) { - for (int i = 0; i< slot_info_pairs_count; i+=2) { - unsigned long start = slot_info_pairs[i]; - unsigned long end = slot_info_pairs[i+1]; - if (start == end) { - ci = sdscatfmt(ci, " %i", start); - } else { - ci = sdscatfmt(ci, " %i-%i", start, end); - } - } - return ci; -} - -/* Generate a csv-alike representation of the specified cluster node. - * See clusterGenNodesDescription() top comment for more information. - * - * The function returns the string representation as an SDS string. */ -sds clusterGenNodeDescription(client *c, clusterNode *node, int tls_primary) { - int j, start; - sds ci; - int port = getNodeClientPort(node, tls_primary); - - /* Node coordinates */ - ci = sdscatlen(sdsempty(),node->name,CLUSTER_NAMELEN); - ci = sdscatfmt(ci," %s:%i@%i", - node->ip, - port, - node->cport); - if (sdslen(node->hostname) != 0) { - ci = sdscatfmt(ci,",%s", node->hostname); - } - /* Don't expose aux fields to any clients yet but do allow them - * to be persisted to nodes.conf */ - if (c == NULL) { - if (sdslen(node->hostname) == 0) { - ci = sdscatfmt(ci,",", 1); - } - for (int i = af_count-1; i >=0; i--) { - if ((tls_primary && i == af_tls_port) || (!tls_primary && i == af_tcp_port)) { - continue; - } - if (auxFieldHandlers[i].isPresent(node)) { - ci = sdscatprintf(ci, ",%s=", auxFieldHandlers[i].field); - ci = auxFieldHandlers[i].getter(node, ci); - } - } - } - - /* Flags */ - ci = sdscatlen(ci," ",1); - ci = representClusterNodeFlags(ci, node->flags); - - /* Slave of... 
or just "-" */ - ci = sdscatlen(ci," ",1); - if (node->slaveof) - ci = sdscatlen(ci,node->slaveof->name,CLUSTER_NAMELEN); - else - ci = sdscatlen(ci,"-",1); - - unsigned long long nodeEpoch = node->configEpoch; - if (nodeIsSlave(node) && node->slaveof) { - nodeEpoch = node->slaveof->configEpoch; - } - /* Latency from the POV of this node, config epoch, link status */ - ci = sdscatfmt(ci," %I %I %U %s", - (long long) node->ping_sent, - (long long) node->pong_received, - nodeEpoch, - (node->link || node->flags & CLUSTER_NODE_MYSELF) ? - "connected" : "disconnected"); - - /* Slots served by this instance. If we already have slots info, - * append it directly, otherwise, generate slots only if it has. */ - if (node->slot_info_pairs) { - ci = representSlotInfo(ci, node->slot_info_pairs, node->slot_info_pairs_count); - } else if (node->numslots > 0) { - start = -1; - for (j = 0; j < CLUSTER_SLOTS; j++) { - int bit; - - if ((bit = clusterNodeGetSlotBit(node,j)) != 0) { - if (start == -1) start = j; - } - if (start != -1 && (!bit || j == CLUSTER_SLOTS-1)) { - if (bit && j == CLUSTER_SLOTS-1) j++; - - if (start == j-1) { - ci = sdscatfmt(ci," %i",start); - } else { - ci = sdscatfmt(ci," %i-%i",start,j-1); - } - start = -1; - } - } - } - - /* Just for MYSELF node we also dump info about slots that - * we are migrating to other instances or importing from other - * instances. */ - if (node->flags & CLUSTER_NODE_MYSELF) { - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (server.cluster->migrating_slots_to[j]) { - ci = sdscatprintf(ci," [%d->-%.40s]",j, - server.cluster->migrating_slots_to[j]->name); - } else if (server.cluster->importing_slots_from[j]) { - ci = sdscatprintf(ci," [%d-<-%.40s]",j, - server.cluster->importing_slots_from[j]->name); - } - } - } - return ci; -} - -/* Generate the slot topology for all nodes and store the string representation - * in the slots_info struct on the node. 
This is used to improve the efficiency - * of clusterGenNodesDescription() because it removes looping of the slot space - * for generating the slot info for each node individually. */ -void clusterGenNodesSlotsInfo(int filter) { - clusterNode *n = NULL; - int start = -1; - - for (int i = 0; i <= CLUSTER_SLOTS; i++) { - /* Find start node and slot id. */ - if (n == NULL) { - if (i == CLUSTER_SLOTS) break; - n = server.cluster->slots[i]; - start = i; - continue; - } - - /* Generate slots info when occur different node with start - * or end of slot. */ - if (i == CLUSTER_SLOTS || n != server.cluster->slots[i]) { - if (!(n->flags & filter)) { - if (!n->slot_info_pairs) { - n->slot_info_pairs = zmalloc(2 * n->numslots * sizeof(uint16_t)); - } - serverAssert((n->slot_info_pairs_count + 1) < (2 * n->numslots)); - n->slot_info_pairs[n->slot_info_pairs_count++] = start; - n->slot_info_pairs[n->slot_info_pairs_count++] = i-1; - } - if (i == CLUSTER_SLOTS) break; - n = server.cluster->slots[i]; - start = i; - } - } -} - -void clusterFreeNodesSlotsInfo(clusterNode *n) { - zfree(n->slot_info_pairs); - n->slot_info_pairs = NULL; - n->slot_info_pairs_count = 0; -} - -/* Generate a csv-alike representation of the nodes we are aware of, - * including the "myself" node, and return an SDS string containing the - * representation (it is up to the caller to free it). - * - * All the nodes matching at least one of the node flags specified in - * "filter" are excluded from the output, so using zero as a filter will - * include all the known nodes in the representation, including nodes in - * the HANDSHAKE state. - * - * Setting tls_primary to 1 to put TLS port in the main : - * field and put TCP port in aux field, instead of the opposite way. - * - * The representation obtained using this function is used for the output - * of the CLUSTER NODES function, and as format for the cluster - * configuration file (nodes.conf) for a given node. 
*/ -sds clusterGenNodesDescription(client *c, int filter, int tls_primary) { - sds ci = sdsempty(), ni; - dictIterator *di; - dictEntry *de; - - /* Generate all nodes slots info firstly. */ - clusterGenNodesSlotsInfo(filter); - - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - - if (node->flags & filter) continue; - ni = clusterGenNodeDescription(c, node, tls_primary); - ci = sdscatsds(ci,ni); - sdsfree(ni); - ci = sdscatlen(ci,"\n",1); - - /* Release slots info. */ - clusterFreeNodesSlotsInfo(node); - } - dictReleaseIterator(di); - return ci; -} - -/* Add to the output buffer of the given client the description of the given cluster link. - * The description is a map with each entry being an attribute of the link. */ -void addReplyClusterLinkDescription(client *c, clusterLink *link) { - addReplyMapLen(c, 6); - - addReplyBulkCString(c, "direction"); - addReplyBulkCString(c, link->inbound ? "from" : "to"); - - /* addReplyClusterLinkDescription is only called for links that have been - * associated with nodes. The association is always bi-directional, so - * in addReplyClusterLinkDescription, link->node should never be NULL. 
*/ - serverAssert(link->node); - sds node_name = sdsnewlen(link->node->name, CLUSTER_NAMELEN); - addReplyBulkCString(c, "node"); - addReplyBulkCString(c, node_name); - sdsfree(node_name); - - addReplyBulkCString(c, "create-time"); - addReplyLongLong(c, link->ctime); - - char events[3], *p; - p = events; - if (link->conn) { - if (connHasReadHandler(link->conn)) *p++ = 'r'; - if (connHasWriteHandler(link->conn)) *p++ = 'w'; - } - *p = '\0'; - addReplyBulkCString(c, "events"); - addReplyBulkCString(c, events); - - addReplyBulkCString(c, "send-buffer-allocated"); - addReplyLongLong(c, link->send_msg_queue_mem); - - addReplyBulkCString(c, "send-buffer-used"); - addReplyLongLong(c, link->send_msg_queue_mem); -} - -/* Add to the output buffer of the given client an array of cluster link descriptions, - * with array entry being a description of a single current cluster link. */ -void addReplyClusterLinksDescription(client *c) { - dictIterator *di; - dictEntry *de; - void *arraylen_ptr = NULL; - int num_links = 0; - - arraylen_ptr = addReplyDeferredLen(c); - - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - if (node->link) { - num_links++; - addReplyClusterLinkDescription(c, node->link); - } - if (node->inbound_link) { - num_links++; - addReplyClusterLinkDescription(c, node->inbound_link); - } - } - dictReleaseIterator(di); - - setDeferredArrayLen(c, arraylen_ptr, num_links); -} - -/* ----------------------------------------------------------------------------- - * CLUSTER command - * -------------------------------------------------------------------------- */ - -const char *getPreferredEndpoint(clusterNode *n) { - switch(server.cluster_preferred_endpoint_type) { - case CLUSTER_ENDPOINT_TYPE_IP: return n->ip; - case CLUSTER_ENDPOINT_TYPE_HOSTNAME: return (sdslen(n->hostname) != 0) ? 
n->hostname : "?"; - case CLUSTER_ENDPOINT_TYPE_UNKNOWN_ENDPOINT: return ""; - } - return "unknown"; -} - -const char *clusterGetMessageTypeString(int type) { - switch(type) { - case CLUSTERMSG_TYPE_PING: return "ping"; - case CLUSTERMSG_TYPE_PONG: return "pong"; - case CLUSTERMSG_TYPE_MEET: return "meet"; - case CLUSTERMSG_TYPE_FAIL: return "fail"; - case CLUSTERMSG_TYPE_PUBLISH: return "publish"; - case CLUSTERMSG_TYPE_PUBLISHSHARD: return "publishshard"; - case CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST: return "auth-req"; - case CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK: return "auth-ack"; - case CLUSTERMSG_TYPE_UPDATE: return "update"; - case CLUSTERMSG_TYPE_MFSTART: return "mfstart"; - case CLUSTERMSG_TYPE_MODULE: return "module"; - } - return "unknown"; -} - -int getSlotOrReply(client *c, robj *o) { - long long slot; - - if (getLongLongFromObject(o,&slot) != C_OK || - slot < 0 || slot >= CLUSTER_SLOTS) - { - addReplyError(c,"Invalid or out of range slot"); - return -1; - } - return (int) slot; -} - -/* Returns an indication if the replica node is fully available - * and should be listed in CLUSTER SLOTS response. - * Returns 1 for available nodes, 0 for nodes that have - * not finished their initial sync, in failed state, or are - * otherwise considered not available to serve read commands. */ -static int isReplicaAvailable(clusterNode *node) { - if (nodeFailed(node)) { - return 0; - } - long long repl_offset = node->repl_offset; - if (node->flags & CLUSTER_NODE_MYSELF) { - /* Nodes do not update their own information - * in the cluster node list. 
*/ - repl_offset = replicationGetSlaveOffset(); - } - return (repl_offset != 0); -} - -int checkSlotAssignmentsOrReply(client *c, unsigned char *slots, int del, int start_slot, int end_slot) { - int slot; - for (slot = start_slot; slot <= end_slot; slot++) { - if (del && server.cluster->slots[slot] == NULL) { - addReplyErrorFormat(c,"Slot %d is already unassigned", slot); - return C_ERR; - } else if (!del && server.cluster->slots[slot]) { - addReplyErrorFormat(c,"Slot %d is already busy", slot); - return C_ERR; - } - if (slots[slot]++ == 1) { - addReplyErrorFormat(c,"Slot %d specified multiple times",(int)slot); - return C_ERR; - } - } - return C_OK; -} - -void clusterUpdateSlots(client *c, unsigned char *slots, int del) { - int j; - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (slots[j]) { - int retval; - - /* If this slot was set as importing we can clear this - * state as now we are the real owner of the slot. */ - if (server.cluster->importing_slots_from[j]) - server.cluster->importing_slots_from[j] = NULL; - - retval = del ? clusterDelSlot(j) : - clusterAddSlot(myself,j); - serverAssertWithInfo(c,NULL,retval == C_OK); - } - } -} - -void addNodeToNodeReply(client *c, clusterNode *node) { - addReplyArrayLen(c, 4); - if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_IP) { - addReplyBulkCString(c, node->ip); - } else if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_HOSTNAME) { - if (sdslen(node->hostname) != 0) { - addReplyBulkCBuffer(c, node->hostname, sdslen(node->hostname)); - } else { - addReplyBulkCString(c, "?"); - } - } else if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_UNKNOWN_ENDPOINT) { - addReplyNull(c); - } else { - serverPanic("Unrecognized preferred endpoint type"); - } - - /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. 
*/ - addReplyLongLong(c, getNodeClientPort(node, shouldReturnTlsInfo())); - addReplyBulkCBuffer(c, node->name, CLUSTER_NAMELEN); - - /* Add the additional endpoint information, this is all the known networking information - * that is not the preferred endpoint. Note the logic is evaluated twice so we can - * correctly report the number of additional network arguments without using a deferred - * map, an assertion is made at the end to check we set the right length. */ - int length = 0; - if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_IP) { - length++; - } - if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_HOSTNAME - && sdslen(node->hostname) != 0) - { - length++; - } - addReplyMapLen(c, length); - - if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_IP) { - addReplyBulkCString(c, "ip"); - addReplyBulkCString(c, node->ip); - length--; - } - if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_HOSTNAME - && sdslen(node->hostname) != 0) - { - addReplyBulkCString(c, "hostname"); - addReplyBulkCBuffer(c, node->hostname, sdslen(node->hostname)); - length--; - } - serverAssert(length == 0); -} - -void addNodeReplyForClusterSlot(client *c, clusterNode *node, int start_slot, int end_slot) { - int i, nested_elements = 3; /* slots (2) + master addr (1) */ - for (i = 0; i < node->numslaves; i++) { - if (!isReplicaAvailable(node->slaves[i])) continue; - nested_elements++; - } - addReplyArrayLen(c, nested_elements); - addReplyLongLong(c, start_slot); - addReplyLongLong(c, end_slot); - addNodeToNodeReply(c, node); - - /* Remaining nodes in reply are replicas for slot range */ - for (i = 0; i < node->numslaves; i++) { - /* This loop is copy/pasted from clusterGenNodeDescription() - * with modifications for per-slot node aggregation. 
*/ - if (!isReplicaAvailable(node->slaves[i])) continue; - addNodeToNodeReply(c, node->slaves[i]); - nested_elements--; - } - serverAssert(nested_elements == 3); /* Original 3 elements */ -} - -/* Add detailed information of a node to the output buffer of the given client. */ -void addNodeDetailsToShardReply(client *c, clusterNode *node) { - int reply_count = 0; - void *node_replylen = addReplyDeferredLen(c); - addReplyBulkCString(c, "id"); - addReplyBulkCBuffer(c, node->name, CLUSTER_NAMELEN); - reply_count++; - - if (node->tcp_port) { - addReplyBulkCString(c, "port"); - addReplyLongLong(c, node->tcp_port); - reply_count++; - } - - if (node->tls_port) { - addReplyBulkCString(c, "tls-port"); - addReplyLongLong(c, node->tls_port); - reply_count++; - } - - addReplyBulkCString(c, "ip"); - addReplyBulkCString(c, node->ip); - reply_count++; - - addReplyBulkCString(c, "endpoint"); - addReplyBulkCString(c, getPreferredEndpoint(node)); - reply_count++; - - if (sdslen(node->hostname) != 0) { - addReplyBulkCString(c, "hostname"); - addReplyBulkCBuffer(c, node->hostname, sdslen(node->hostname)); - reply_count++; - } - - long long node_offset; - if (node->flags & CLUSTER_NODE_MYSELF) { - node_offset = nodeIsSlave(node) ? replicationGetSlaveOffset() : server.master_repl_offset; - } else { - node_offset = node->repl_offset; - } - - addReplyBulkCString(c, "role"); - addReplyBulkCString(c, nodeIsSlave(node) ? "replica" : "master"); - reply_count++; - - addReplyBulkCString(c, "replication-offset"); - addReplyLongLong(c, node_offset); - reply_count++; - - addReplyBulkCString(c, "health"); - const char *health_msg = NULL; - if (nodeFailed(node)) { - health_msg = "fail"; - } else if (nodeIsSlave(node) && node_offset == 0) { - health_msg = "loading"; - } else { - health_msg = "online"; - } - addReplyBulkCString(c, health_msg); - reply_count++; - - setDeferredMapLen(c, node_replylen, reply_count); -} - -/* Add the shard reply of a single shard based off the given primary node. 
*/ -void addShardReplyForClusterShards(client *c, list *nodes) { - serverAssert(listLength(nodes) > 0); - clusterNode *n = listNodeValue(listFirst(nodes)); - addReplyMapLen(c, 2); - addReplyBulkCString(c, "slots"); - - /* Use slot_info_pairs from the primary only */ - while (n->slaveof != NULL) n = n->slaveof; - - if (n->slot_info_pairs != NULL) { - serverAssert((n->slot_info_pairs_count % 2) == 0); - addReplyArrayLen(c, n->slot_info_pairs_count); - for (int i = 0; i < n->slot_info_pairs_count; i++) - addReplyLongLong(c, (unsigned long)n->slot_info_pairs[i]); - } else { - /* If no slot info pair is provided, the node owns no slots */ - addReplyArrayLen(c, 0); - } - - addReplyBulkCString(c, "nodes"); - addReplyArrayLen(c, listLength(nodes)); - listIter li; - listRewind(nodes, &li); - for (listNode *ln = listNext(&li); ln != NULL; ln = listNext(&li)) { - clusterNode *n = listNodeValue(ln); - addNodeDetailsToShardReply(c, n); - clusterFreeNodesSlotsInfo(n); - } -} - -/* Add to the output buffer of the given client, an array of slot (start, end) - * pair owned by the shard, also the primary and set of replica(s) along with - * information about each node. */ -void clusterReplyShards(client *c) { - addReplyArrayLen(c, dictSize(server.cluster->shards)); - /* This call will add slot_info_pairs to all nodes */ - clusterGenNodesSlotsInfo(0); - dictIterator *di = dictGetSafeIterator(server.cluster->shards); - for(dictEntry *de = dictNext(di); de != NULL; de = dictNext(di)) { - addShardReplyForClusterShards(c, dictGetVal(de)); - } - dictReleaseIterator(di); -} - -void clusterReplyMultiBulkSlots(client * c) { - /* Format: 1) 1) start slot - * 2) end slot - * 3) 1) master IP - * 2) master port - * 3) node ID - * 4) 1) replica IP - * 2) replica port - * 3) node ID - * ... 
continued until done - */ - clusterNode *n = NULL; - int num_masters = 0, start = -1; - void *slot_replylen = addReplyDeferredLen(c); - - for (int i = 0; i <= CLUSTER_SLOTS; i++) { - /* Find start node and slot id. */ - if (n == NULL) { - if (i == CLUSTER_SLOTS) break; - n = server.cluster->slots[i]; - start = i; - continue; - } - - /* Add cluster slots info when occur different node with start - * or end of slot. */ - if (i == CLUSTER_SLOTS || n != server.cluster->slots[i]) { - addNodeReplyForClusterSlot(c, n, start, i-1); - num_masters++; - if (i == CLUSTER_SLOTS) break; - n = server.cluster->slots[i]; - start = i; - } - } - setDeferredArrayLen(c, slot_replylen, num_masters); -} - -sds genClusterInfoString(void) { - sds info = sdsempty(); - char *statestr[] = {"ok","fail"}; - int slots_assigned = 0, slots_ok = 0, slots_pfail = 0, slots_fail = 0; - uint64_t myepoch; - int j; - - for (j = 0; j < CLUSTER_SLOTS; j++) { - clusterNode *n = server.cluster->slots[j]; - - if (n == NULL) continue; - slots_assigned++; - if (nodeFailed(n)) { - slots_fail++; - } else if (nodeTimedOut(n)) { - slots_pfail++; - } else { - slots_ok++; - } - } - - myepoch = (nodeIsSlave(myself) && myself->slaveof) ? - myself->slaveof->configEpoch : myself->configEpoch; - - info = sdscatprintf(info, - "cluster_state:%s\r\n" - "cluster_slots_assigned:%d\r\n" - "cluster_slots_ok:%d\r\n" - "cluster_slots_pfail:%d\r\n" - "cluster_slots_fail:%d\r\n" - "cluster_known_nodes:%lu\r\n" - "cluster_size:%d\r\n" - "cluster_current_epoch:%llu\r\n" - "cluster_my_epoch:%llu\r\n" - , statestr[server.cluster->state], - slots_assigned, - slots_ok, - slots_pfail, - slots_fail, - dictSize(server.cluster->nodes), - server.cluster->size, - (unsigned long long) server.cluster->currentEpoch, - (unsigned long long) myepoch - ); - - /* Show stats about messages sent and received. 
*/ - long long tot_msg_sent = 0; - long long tot_msg_received = 0; - - for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) { - if (server.cluster->stats_bus_messages_sent[i] == 0) continue; - tot_msg_sent += server.cluster->stats_bus_messages_sent[i]; - info = sdscatprintf(info, - "cluster_stats_messages_%s_sent:%lld\r\n", - clusterGetMessageTypeString(i), - server.cluster->stats_bus_messages_sent[i]); - } - info = sdscatprintf(info, - "cluster_stats_messages_sent:%lld\r\n", tot_msg_sent); - - for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) { - if (server.cluster->stats_bus_messages_received[i] == 0) continue; - tot_msg_received += server.cluster->stats_bus_messages_received[i]; - info = sdscatprintf(info, - "cluster_stats_messages_%s_received:%lld\r\n", - clusterGetMessageTypeString(i), - server.cluster->stats_bus_messages_received[i]); - } - info = sdscatprintf(info, - "cluster_stats_messages_received:%lld\r\n", tot_msg_received); - - info = sdscatprintf(info, - "total_cluster_links_buffer_limit_exceeded:%llu\r\n", - server.cluster->stat_cluster_links_buffer_limit_exceeded); - - return info; -} - -void clusterCommand(client *c) { - if (server.cluster_enabled == 0) { - addReplyError(c,"This instance has cluster support disabled"); - return; - } - - if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"help")) { - const char *help[] = { -"ADDSLOTS [ ...]", -" Assign slots to current node.", -"ADDSLOTSRANGE [ ...]", -" Assign slots which are between and to current node.", -"BUMPEPOCH", -" Advance the cluster config epoch.", -"COUNT-FAILURE-REPORTS ", -" Return number of failure reports for .", -"COUNTKEYSINSLOT ", -" Return the number of keys in .", -"DELSLOTS [ ...]", -" Delete slots information from current node.", -"DELSLOTSRANGE [ ...]", -" Delete slots information which are between and from current node.", -"FAILOVER [FORCE|TAKEOVER]", -" Promote current replica node to being a master.", -"FORGET ", -" Remove a node from the cluster.", -"GETKEYSINSLOT ", -" Return key 
names stored by current node in a slot.", -"FLUSHSLOTS", -" Delete current node own slots information.", -"INFO", -" Return information about the cluster.", -"KEYSLOT ", -" Return the hash slot for .", -"MEET []", -" Connect nodes into a working cluster.", -"MYID", -" Return the node id.", -"MYSHARDID", -" Return the node's shard id.", -"NODES", -" Return cluster configuration seen by node. Output format:", -" ...", -"REPLICATE ", -" Configure current node as replica to .", -"RESET [HARD|SOFT]", -" Reset current node (default: soft).", -"SET-CONFIG-EPOCH ", -" Set config epoch of current node.", -"SETSLOT (IMPORTING |MIGRATING |STABLE|NODE )", -" Set slot state.", -"REPLICAS ", -" Return replicas.", -"SAVECONFIG", -" Force saving cluster configuration on disk.", -"SLOTS", -" Return information about slots range mappings. Each range is made of:", -" start, end, master and replicas IP addresses, ports and ids", -"SHARDS", -" Return information about slot range mappings and the nodes associated with them.", -"LINKS", -" Return information about all network links between this node and its peers.", -" Output format is an array where each array element is a map containing attributes of a link", -NULL - }; - addReplyHelp(c, help); - } else if (!strcasecmp(c->argv[1]->ptr,"meet") && (c->argc == 4 || c->argc == 5)) { - /* CLUSTER MEET [cport] */ - long long port, cport; - - if (getLongLongFromObject(c->argv[3], &port) != C_OK) { - addReplyErrorFormat(c,"Invalid base port specified: %s", - (char*)c->argv[3]->ptr); - return; - } - - if (c->argc == 5) { - if (getLongLongFromObject(c->argv[4], &cport) != C_OK) { - addReplyErrorFormat(c,"Invalid bus port specified: %s", - (char*)c->argv[4]->ptr); - return; - } - } else { - cport = port + CLUSTER_PORT_INCR; - } - - if (clusterStartHandshake(c->argv[2]->ptr,port,cport) == 0 && - errno == EINVAL) - { - addReplyErrorFormat(c,"Invalid node address specified: %s:%s", - (char*)c->argv[2]->ptr, (char*)c->argv[3]->ptr); - } else { - 
addReply(c,shared.ok); - } - } else if (!strcasecmp(c->argv[1]->ptr,"nodes") && c->argc == 2) { - /* CLUSTER NODES */ - /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */ - sds nodes = clusterGenNodesDescription(c, 0, shouldReturnTlsInfo()); - addReplyVerbatim(c,nodes,sdslen(nodes),"txt"); - sdsfree(nodes); - } else if (!strcasecmp(c->argv[1]->ptr,"myid") && c->argc == 2) { - /* CLUSTER MYID */ - addReplyBulkCBuffer(c,myself->name, CLUSTER_NAMELEN); - } else if (!strcasecmp(c->argv[1]->ptr,"myshardid") && c->argc == 2) { - /* CLUSTER MYSHARDID */ - addReplyBulkCBuffer(c,myself->shard_id, CLUSTER_NAMELEN); - } else if (!strcasecmp(c->argv[1]->ptr,"slots") && c->argc == 2) { - /* CLUSTER SLOTS */ - clusterReplyMultiBulkSlots(c); - } else if (!strcasecmp(c->argv[1]->ptr,"shards") && c->argc == 2) { - /* CLUSTER SHARDS */ - clusterReplyShards(c); - } else if (!strcasecmp(c->argv[1]->ptr,"flushslots") && c->argc == 2) { - /* CLUSTER FLUSHSLOTS */ - if (dbSize(&server.db[0], DB_MAIN) != 0) { - addReplyError(c,"DB must be empty to perform CLUSTER FLUSHSLOTS."); - return; - } - clusterDelNodeSlots(myself); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } else if ((!strcasecmp(c->argv[1]->ptr,"addslots") || - !strcasecmp(c->argv[1]->ptr,"delslots")) && c->argc >= 3) - { - /* CLUSTER ADDSLOTS [slot] ... */ - /* CLUSTER DELSLOTS [slot] ... */ - int j, slot; - unsigned char *slots = zmalloc(CLUSTER_SLOTS); - int del = !strcasecmp(c->argv[1]->ptr,"delslots"); - - memset(slots,0,CLUSTER_SLOTS); - /* Check that all the arguments are parseable.*/ - for (j = 2; j < c->argc; j++) { - if ((slot = getSlotOrReply(c,c->argv[j])) == C_ERR) { - zfree(slots); - return; - } - } - /* Check that the slots are not already busy. 
*/ - for (j = 2; j < c->argc; j++) { - slot = getSlotOrReply(c,c->argv[j]); - if (checkSlotAssignmentsOrReply(c, slots, del, slot, slot) == C_ERR) { - zfree(slots); - return; - } - } - clusterUpdateSlots(c, slots, del); - zfree(slots); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } else if ((!strcasecmp(c->argv[1]->ptr,"addslotsrange") || - !strcasecmp(c->argv[1]->ptr,"delslotsrange")) && c->argc >= 4) { - if (c->argc % 2 == 1) { - addReplyErrorArity(c); - return; - } - /* CLUSTER ADDSLOTSRANGE [ ...] */ - /* CLUSTER DELSLOTSRANGE [ ...] */ - int j, startslot, endslot; - unsigned char *slots = zmalloc(CLUSTER_SLOTS); - int del = !strcasecmp(c->argv[1]->ptr,"delslotsrange"); - - memset(slots,0,CLUSTER_SLOTS); - /* Check that all the arguments are parseable and that all the - * slots are not already busy. */ - for (j = 2; j < c->argc; j += 2) { - if ((startslot = getSlotOrReply(c,c->argv[j])) == C_ERR) { - zfree(slots); - return; - } - if ((endslot = getSlotOrReply(c,c->argv[j+1])) == C_ERR) { - zfree(slots); - return; - } - if (startslot > endslot) { - addReplyErrorFormat(c,"start slot number %d is greater than end slot number %d", startslot, endslot); - zfree(slots); - return; - } - - if (checkSlotAssignmentsOrReply(c, slots, del, startslot, endslot) == C_ERR) { - zfree(slots); - return; - } - } - clusterUpdateSlots(c, slots, del); - zfree(slots); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"setslot") && c->argc >= 4) { - /* SETSLOT 10 MIGRATING */ - /* SETSLOT 10 IMPORTING */ - /* SETSLOT 10 STABLE */ - /* SETSLOT 10 NODE */ - int slot; - clusterNode *n; - - if (nodeIsSlave(myself)) { - addReplyError(c,"Please use SETSLOT only with masters."); - return; - } - - if ((slot = getSlotOrReply(c,c->argv[2])) == -1) return; - - if (!strcasecmp(c->argv[3]->ptr,"migrating") && c->argc == 5) { - if 
(server.cluster->slots[slot] != myself) { - addReplyErrorFormat(c,"I'm not the owner of hash slot %u",slot); - return; - } - n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); - if (n == NULL) { - addReplyErrorFormat(c,"I don't know about node %s", - (char*)c->argv[4]->ptr); - return; - } - if (nodeIsSlave(n)) { - addReplyError(c,"Target node is not a master"); - return; - } - server.cluster->migrating_slots_to[slot] = n; - } else if (!strcasecmp(c->argv[3]->ptr,"importing") && c->argc == 5) { - if (server.cluster->slots[slot] == myself) { - addReplyErrorFormat(c, - "I'm already the owner of hash slot %u",slot); - return; - } - n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); - if (n == NULL) { - addReplyErrorFormat(c,"I don't know about node %s", - (char*)c->argv[4]->ptr); - return; - } - if (nodeIsSlave(n)) { - addReplyError(c,"Target node is not a master"); - return; - } - server.cluster->importing_slots_from[slot] = n; - } else if (!strcasecmp(c->argv[3]->ptr,"stable") && c->argc == 4) { - /* CLUSTER SETSLOT STABLE */ - server.cluster->importing_slots_from[slot] = NULL; - server.cluster->migrating_slots_to[slot] = NULL; - } else if (!strcasecmp(c->argv[3]->ptr,"node") && c->argc == 5) { - /* CLUSTER SETSLOT NODE */ - n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); - if (!n) { - addReplyErrorFormat(c,"Unknown node %s", - (char*)c->argv[4]->ptr); - return; - } - if (nodeIsSlave(n)) { - addReplyError(c,"Target node is not a master"); - return; - } - /* If this hash slot was served by 'myself' before to switch - * make sure there are no longer local keys for this hash slot. 
*/ - if (server.cluster->slots[slot] == myself && n != myself) { - if (countKeysInSlot(slot) != 0) { - addReplyErrorFormat(c, - "Can't assign hashslot %d to a different node " - "while I still hold keys for this hash slot.", slot); - return; - } - } - /* If this slot is in migrating status but we have no keys - * for it assigning the slot to another node will clear - * the migrating status. */ - if (countKeysInSlot(slot) == 0 && - server.cluster->migrating_slots_to[slot]) - server.cluster->migrating_slots_to[slot] = NULL; - - int slot_was_mine = server.cluster->slots[slot] == myself; - clusterDelSlot(slot); - clusterAddSlot(n,slot); - - /* If we are a master left without slots, we should turn into a - * replica of the new master. */ - if (slot_was_mine && - n != myself && - myself->numslots == 0 && - server.cluster_allow_replica_migration) - { - serverLog(LL_NOTICE, - "Configuration change detected. Reconfiguring myself " - "as a replica of %.40s (%s)", n->name, n->human_nodename); - clusterSetMaster(n); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | - CLUSTER_TODO_UPDATE_STATE | - CLUSTER_TODO_FSYNC_CONFIG); - } - - /* If this node was importing this slot, assigning the slot to - * itself also clears the importing status. */ - if (n == myself && - server.cluster->importing_slots_from[slot]) - { - /* This slot was manually migrated, set this node configEpoch - * to a new epoch so that the new version can be propagated - * by the cluster. - * - * Note that if this ever results in a collision with another - * node getting the same configEpoch, for example because a - * failover happens at the same time we close the slot, the - * configEpoch collision resolution will fix it assigning - * a different epoch to each node. 
*/ - if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { - serverLog(LL_NOTICE, - "configEpoch updated after importing slot %d", slot); - } - server.cluster->importing_slots_from[slot] = NULL; - /* After importing this slot, let the other nodes know as - * soon as possible. */ - clusterBroadcastPong(CLUSTER_BROADCAST_ALL); - } - } else { - addReplyError(c, - "Invalid CLUSTER SETSLOT action or number of arguments. Try CLUSTER HELP"); - return; - } - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_UPDATE_STATE); - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"bumpepoch") && c->argc == 2) { - /* CLUSTER BUMPEPOCH */ - int retval = clusterBumpConfigEpochWithoutConsensus(); - sds reply = sdscatprintf(sdsempty(),"+%s %llu\r\n", - (retval == C_OK) ? "BUMPED" : "STILL", - (unsigned long long) myself->configEpoch); - addReplySds(c,reply); - } else if (!strcasecmp(c->argv[1]->ptr,"info") && c->argc == 2) { - /* CLUSTER INFO */ - - sds info = genClusterInfoString(); - - /* Produce the reply protocol. 
*/ - addReplyVerbatim(c,info,sdslen(info),"txt"); - sdsfree(info); - } else if (!strcasecmp(c->argv[1]->ptr,"saveconfig") && c->argc == 2) { - int retval = clusterSaveConfig(1); - - if (retval == 0) - addReply(c,shared.ok); - else - addReplyErrorFormat(c,"error saving the cluster node config: %s", - strerror(errno)); - } else if (!strcasecmp(c->argv[1]->ptr,"keyslot") && c->argc == 3) { - /* CLUSTER KEYSLOT */ - sds key = c->argv[2]->ptr; - - addReplyLongLong(c,keyHashSlot(key,sdslen(key))); - } else if (!strcasecmp(c->argv[1]->ptr,"countkeysinslot") && c->argc == 3) { - /* CLUSTER COUNTKEYSINSLOT */ - long long slot; - - if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != C_OK) - return; - if (slot < 0 || slot >= CLUSTER_SLOTS) { - addReplyError(c,"Invalid slot"); - return; - } - addReplyLongLong(c,countKeysInSlot(slot)); - } else if (!strcasecmp(c->argv[1]->ptr,"getkeysinslot") && c->argc == 4) { - /* CLUSTER GETKEYSINSLOT */ - long long maxkeys, slot; - - if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != C_OK) - return; - if (getLongLongFromObjectOrReply(c,c->argv[3],&maxkeys,NULL) - != C_OK) - return; - if (slot < 0 || slot >= CLUSTER_SLOTS || maxkeys < 0) { - addReplyError(c,"Invalid slot or number of keys"); - return; - } - - unsigned int keys_in_slot = countKeysInSlot(slot); - unsigned int numkeys = maxkeys > keys_in_slot ? 
keys_in_slot : maxkeys; - addReplyArrayLen(c,numkeys); - dictIterator *iter = NULL; - dictEntry *de = NULL; - iter = dictGetIterator(server.db->dict[slot]); - for (unsigned int i = 0; i < numkeys; i++) { - de = dictNext(iter); - serverAssert(de != NULL); - sds sdskey = dictGetKey(de); - addReplyBulkCBuffer(c, sdskey, sdslen(sdskey)); - } - dictReleaseIterator(iter); - } else if (!strcasecmp(c->argv[1]->ptr,"forget") && c->argc == 3) { - /* CLUSTER FORGET */ - clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); - if (!n) { - if (clusterBlacklistExists((char*)c->argv[2]->ptr)) - /* Already forgotten. The deletion may have been gossipped by - * another node, so we pretend it succeeded. */ - addReply(c,shared.ok); - else - addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); - return; - } else if (n == myself) { - addReplyError(c,"I tried hard but I can't forget myself..."); - return; - } else if (nodeIsSlave(myself) && myself->slaveof == n) { - addReplyError(c,"Can't forget my master!"); - return; - } - clusterBlacklistAddNode(n); - clusterDelNode(n); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"replicate") && c->argc == 3) { - /* CLUSTER REPLICATE */ - /* Lookup the specified node in our table. */ - clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); - if (!n) { - addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); - return; - } - - /* I can't replicate myself. */ - if (n == myself) { - addReplyError(c,"Can't replicate myself"); - return; - } - - /* Can't replicate a slave. */ - if (nodeIsSlave(n)) { - addReplyError(c,"I can only replicate a master, not a replica."); - return; - } - - /* If the instance is currently a master, it should have no assigned - * slots nor keys to accept to replicate some other node. - * Slaves can switch to another master without issues. 
*/ - if (nodeIsMaster(myself) && - (myself->numslots != 0 || dbSize(&server.db[0], DB_MAIN) != 0)) { - addReplyError(c, - "To set a master the node must be empty and " - "without assigned slots."); - return; - } - - /* Set the master. */ - clusterSetMaster(n); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } else if ((!strcasecmp(c->argv[1]->ptr,"slaves") || - !strcasecmp(c->argv[1]->ptr,"replicas")) && c->argc == 3) { - /* CLUSTER SLAVES */ - /* CLUSTER REPLICAS */ - clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); - int j; - - /* Lookup the specified node in our table. */ - if (!n) { - addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); - return; - } - - if (nodeIsSlave(n)) { - addReplyError(c,"The specified node is not a master"); - return; - } - - /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */ - addReplyArrayLen(c,n->numslaves); - for (j = 0; j < n->numslaves; j++) { - sds ni = clusterGenNodeDescription(c, n->slaves[j], shouldReturnTlsInfo()); - addReplyBulkCString(c,ni); - sdsfree(ni); - } - } else if (!strcasecmp(c->argv[1]->ptr,"count-failure-reports") && - c->argc == 3) - { - /* CLUSTER COUNT-FAILURE-REPORTS */ - clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); - - if (!n) { - addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); - return; - } else { - addReplyLongLong(c,clusterNodeFailureReportsCount(n)); - } - } else if (!strcasecmp(c->argv[1]->ptr,"failover") && - (c->argc == 2 || c->argc == 3)) - { - /* CLUSTER FAILOVER [FORCE|TAKEOVER] */ - int force = 0, takeover = 0; - - if (c->argc == 3) { - if (!strcasecmp(c->argv[2]->ptr,"force")) { - force = 1; - } else if (!strcasecmp(c->argv[2]->ptr,"takeover")) { - takeover = 1; - force = 1; /* Takeover also implies force. */ - } else { - addReplyErrorObject(c,shared.syntaxerr); - return; - } - } - - /* Check preconditions. 
*/ - if (nodeIsMaster(myself)) { - addReplyError(c,"You should send CLUSTER FAILOVER to a replica"); - return; - } else if (myself->slaveof == NULL) { - addReplyError(c,"I'm a replica but my master is unknown to me"); - return; - } else if (!force && - (nodeFailed(myself->slaveof) || - myself->slaveof->link == NULL)) - { - addReplyError(c,"Master is down or failed, " - "please use CLUSTER FAILOVER FORCE"); - return; - } - resetManualFailover(); - server.cluster->mf_end = mstime() + CLUSTER_MF_TIMEOUT; - - if (takeover) { - /* A takeover does not perform any initial check. It just - * generates a new configuration epoch for this node without - * consensus, claims the master's slots, and broadcast the new - * configuration. */ - serverLog(LL_NOTICE,"Taking over the master (user request)."); - clusterBumpConfigEpochWithoutConsensus(); - clusterFailoverReplaceYourMaster(); - } else if (force) { - /* If this is a forced failover, we don't need to talk with our - * master to agree about the offset. We just failover taking over - * it without coordination. */ - serverLog(LL_NOTICE,"Forced failover user request accepted."); - server.cluster->mf_can_start = 1; - } else { - serverLog(LL_NOTICE,"Manual failover user request accepted."); - clusterSendMFStart(myself->slaveof); - } - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"set-config-epoch") && c->argc == 3) - { - /* CLUSTER SET-CONFIG-EPOCH - * - * The user is allowed to set the config epoch only when a node is - * totally fresh: no config epoch, no other known node, and so forth. - * This happens at cluster creation time to start with a cluster where - * every node has a different node ID, without to rely on the conflicts - * resolution system which is too slow when a big cluster is created. 
*/ - long long epoch; - - if (getLongLongFromObjectOrReply(c,c->argv[2],&epoch,NULL) != C_OK) - return; - - if (epoch < 0) { - addReplyErrorFormat(c,"Invalid config epoch specified: %lld",epoch); - } else if (dictSize(server.cluster->nodes) > 1) { - addReplyError(c,"The user can assign a config epoch only when the " - "node does not know any other node."); - } else if (myself->configEpoch != 0) { - addReplyError(c,"Node config epoch is already non-zero"); - } else { - myself->configEpoch = epoch; - serverLog(LL_NOTICE, - "configEpoch set to %llu via CLUSTER SET-CONFIG-EPOCH", - (unsigned long long) myself->configEpoch); - - if (server.cluster->currentEpoch < (uint64_t)epoch) - server.cluster->currentEpoch = epoch; - /* No need to fsync the config here since in the unlucky event - * of a failure to persist the config, the conflict resolution code - * will assign a unique config to this node. */ - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } - } else if (!strcasecmp(c->argv[1]->ptr,"reset") && - (c->argc == 2 || c->argc == 3)) - { - /* CLUSTER RESET [SOFT|HARD] */ - int hard = 0; - - /* Parse soft/hard argument. Default is soft. */ - if (c->argc == 3) { - if (!strcasecmp(c->argv[2]->ptr,"hard")) { - hard = 1; - } else if (!strcasecmp(c->argv[2]->ptr,"soft")) { - hard = 0; - } else { - addReplyErrorObject(c,shared.syntaxerr); - return; - } - } - - /* Slaves can be reset while containing data, but not master nodes - * that must be empty. 
*/ - if (nodeIsMaster(myself) && dbSize(c->db, DB_MAIN) != 0) { - addReplyError(c,"CLUSTER RESET can't be called with " - "master nodes containing keys"); - return; - } - clusterReset(hard); - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"links") && c->argc == 2) { - /* CLUSTER LINKS */ - addReplyClusterLinksDescription(c); - } else { - addReplySubcommandSyntaxError(c); - return; - } -} - -void removeChannelsInSlot(unsigned int slot) { - unsigned int channelcount = countChannelsInSlot(slot); - if (channelcount == 0) return; - - /* Retrieve all the channels for the slot. */ - robj **channels = zmalloc(sizeof(robj*)*channelcount); - raxIterator iter; - int j = 0; - unsigned char indexed[2]; - - indexed[0] = (slot >> 8) & 0xff; - indexed[1] = slot & 0xff; - raxStart(&iter,server.cluster->slots_to_channels); - raxSeek(&iter,">=",indexed,2); - while(raxNext(&iter)) { - if (iter.key[0] != indexed[0] || iter.key[1] != indexed[1]) break; - channels[j++] = createStringObject((char*)iter.key + 2, iter.key_len - 2); - } - raxStop(&iter); - - pubsubUnsubscribeShardChannels(channels, channelcount); - zfree(channels); + return connectionTypeTcp(); } /* ----------------------------------------------------------------------------- @@ -6638,7 +176,7 @@ void restoreCommand(client *c) { lfu_freq == -1) { if (getLongLongFromObjectOrReply(c,c->argv[j+1],&lru_idle,NULL) - != C_OK) return; + != C_OK) return; if (lru_idle < 0) { addReplyError(c,"Invalid IDLETIME value, must be >= 0"); return; @@ -6649,7 +187,7 @@ void restoreCommand(client *c) { lru_idle == -1) { if (getLongLongFromObjectOrReply(c,c->argv[j+1],&lfu_freq,NULL) - != C_OK) return; + != C_OK) return; if (lfu_freq < 0 || lfu_freq > 255) { addReplyError(c,"Invalid FREQ value, must be >= 0 and <= 255"); return; @@ -6728,7 +266,6 @@ void restoreCommand(client *c) { addReply(c,shared.ok); server.dirty++; } - /* MIGRATE socket cache implementation. 
* * We take a map between host:ip and a TCP socket that we used to connect @@ -6784,7 +321,7 @@ migrateCachedSocket* migrateGetSocket(client *c, robj *host, robj *port, long ti /* Create the connection */ conn = connCreate(connTypeOfCluster()); if (connBlockingConnect(conn, host->ptr, atoi(port->ptr), timeout) - != C_OK) { + != C_OK) { addReplyError(c,"-IOERR error or timeout connecting to the client"); connClose(conn); sdsfree(name); @@ -6891,8 +428,8 @@ void migrateCommand(client *c) { } else if (!strcasecmp(c->argv[j]->ptr,"keys")) { if (sdslen(c->argv[3]->ptr) != 0) { addReplyError(c, - "When using MIGRATE KEYS option, the key argument" - " must be set to the empty string"); + "When using MIGRATE KEYS option, the key argument" + " must be set to the empty string"); return; } first_key = j+1; @@ -6934,7 +471,7 @@ void migrateCommand(client *c) { return; } -try_again: + try_again: write_error = 0; /* Connect */ @@ -6953,10 +490,10 @@ try_again: serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"AUTH",4)); if (username) { serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,username, - sdslen(username))); + sdslen(username))); } serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,password, - sdslen(password))); + sdslen(password))); } /* Send the SELECT command if the current DB is not already selected. */ @@ -6992,24 +529,24 @@ try_again: kv[non_expired++] = kv[j]; serverAssertWithInfo(c,NULL, - rioWriteBulkCount(&cmd,'*',replace ? 5 : 4)); + rioWriteBulkCount(&cmd,'*',replace ? 
5 : 4)); if (server.cluster_enabled) serverAssertWithInfo(c,NULL, - rioWriteBulkString(&cmd,"RESTORE-ASKING",14)); + rioWriteBulkString(&cmd,"RESTORE-ASKING",14)); else serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"RESTORE",7)); serverAssertWithInfo(c,NULL,sdsEncodedObject(kv[j])); serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,kv[j]->ptr, - sdslen(kv[j]->ptr))); + sdslen(kv[j]->ptr))); serverAssertWithInfo(c,NULL,rioWriteBulkLongLong(&cmd,ttl)); /* Emit the payload argument, that is the serialized object using * the DUMP format. */ createDumpPayload(&payload,ov[j],kv[j],dbid); serverAssertWithInfo(c,NULL, - rioWriteBulkString(&cmd,payload.io.buffer.ptr, - sdslen(payload.io.buffer.ptr))); + rioWriteBulkString(&cmd,payload.io.buffer.ptr, + sdslen(payload.io.buffer.ptr))); sdsfree(payload.io.buffer.ptr); /* Add the REPLACE option to the RESTORE command if it was specified @@ -7081,7 +618,7 @@ try_again: error_from_target = 1; addReplyErrorFormat(c,"Target instance replied with error: %s", - errbuf+1); + errbuf+1); } } else { if (!copy) { @@ -7157,7 +694,7 @@ try_again: /* On socket errors we try to close the cached socket and try again. * It is very common for the cached socket to get closed, if just reopening * it works it's a shame to notify the error to the caller. */ -socket_err: + socket_err: /* Cleanup we want to perform in both the retry and no retry case. * Note: Closing the migrate socket will also force SELECT next time. */ sdsfree(cmd.io.buffer.ptr); @@ -7180,11 +717,693 @@ socket_err: /* Cleanup we want to do if no retry is attempted. */ zfree(ov); zfree(kv); addReplyErrorSds(c, sdscatprintf(sdsempty(), - "-IOERR error or timeout %s to target instance", - write_error ? "writing" : "reading")); + "-IOERR error or timeout %s to target instance", + write_error ? "writing" : "reading")); return; } +/* Cluster node sanity check. Returns C_OK if the node id + * is valid an C_ERR otherwise. 
*/ +int verifyClusterNodeId(const char *name, int length) { + if (length != CLUSTER_NAMELEN) return C_ERR; + for (int i = 0; i < length; i++) { + if (name[i] >= 'a' && name[i] <= 'z') continue; + if (name[i] >= '0' && name[i] <= '9') continue; + return C_ERR; + } + return C_OK; +} + +int isValidAuxChar(int c) { + return isalnum(c) || (strchr("!#$%&()*+:;<>?@[]^{|}~", c) == NULL); +} + +int isValidAuxString(char *s, unsigned int length) { + for (unsigned i = 0; i < length; i++) { + if (!isValidAuxChar(s[i])) return 0; + } + return 1; +} + +void clusterCommandMyId(client *c) { + char *name = clusterNodeGetName(getMyClusterNode()); + if (name) { + addReplyBulkCBuffer(c,name, CLUSTER_NAMELEN); + } else { + addReplyError(c, "No ID yet"); + } +} + +char* getMyClusterId(void) { + return clusterNodeGetName(getMyClusterNode()); +} + +void clusterCommandMyShardId(client *c) { + char *sid = clusterNodeGetShardId(getMyClusterNode()); + if (sid) { + addReplyBulkCBuffer(c,sid, CLUSTER_NAMELEN); + } else { + addReplyError(c, "No shard ID yet"); + } +} + +/* When a cluster command is called, we need to decide whether to return TLS info or + * non-TLS info by the client's connection type. However if the command is called by + * a Lua script or RM_call, there is no connection in the fake client, so we use + * server.current_client here to get the real client if available. And if it is not + * available (modules may call commands without a real client), we return the default + * info, which is determined by server.tls_cluster. 
*/ +static int shouldReturnTlsInfo(void) { + if (server.current_client && server.current_client->conn) { + return connIsTLS(server.current_client->conn); + } else { + return server.tls_cluster; + } +} + +unsigned int countKeysInSlot(unsigned int slot) { + return dictSize(server.db->dict[slot]); +} + +void clusterCommandHelp(client *c) { + const char *help[] = { + "COUNTKEYSINSLOT ", + " Return the number of keys in .", + "GETKEYSINSLOT ", + " Return key names stored by current node in a slot.", + "INFO", + " Return information about the cluster.", + "KEYSLOT ", + " Return the hash slot for .", + "MYID", + " Return the node id.", + "MYSHARDID", + " Return the node's shard id.", + "NODES", + " Return cluster configuration seen by node. Output format:", + " ...", + "REPLICAS ", + " Return replicas.", + "SLOTS", + " Return information about slots range mappings. Each range is made of:", + " start, end, master and replicas IP addresses, ports and ids", + "SHARDS", + " Return information about slot range mappings and the nodes associated with them.", + NULL + }; + + addExtendedReplyHelp(c, help, clusterCommandExtendedHelp()); +} + +void clusterCommand(client *c) { + if (server.cluster_enabled == 0) { + addReplyError(c,"This instance has cluster support disabled"); + return; + } + + if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"help")) { + clusterCommandHelp(c); + } else if (!strcasecmp(c->argv[1]->ptr,"nodes") && c->argc == 2) { + /* CLUSTER NODES */ + /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. 
*/ + sds nodes = clusterGenNodesDescription(c, 0, shouldReturnTlsInfo()); + addReplyVerbatim(c,nodes,sdslen(nodes),"txt"); + sdsfree(nodes); + } else if (!strcasecmp(c->argv[1]->ptr,"myid") && c->argc == 2) { + /* CLUSTER MYID */ + clusterCommandMyId(c); + } else if (!strcasecmp(c->argv[1]->ptr,"myshardid") && c->argc == 2) { + /* CLUSTER MYSHARDID */ + clusterCommandMyShardId(c); + } else if (!strcasecmp(c->argv[1]->ptr,"slots") && c->argc == 2) { + /* CLUSTER SLOTS */ + clusterCommandSlots(c); + } else if (!strcasecmp(c->argv[1]->ptr,"shards") && c->argc == 2) { + /* CLUSTER SHARDS */ + clusterCommandShards(c); + } else if (!strcasecmp(c->argv[1]->ptr,"info") && c->argc == 2) { + /* CLUSTER INFO */ + + sds info = genClusterInfoString(); + + /* Produce the reply protocol. */ + addReplyVerbatim(c,info,sdslen(info),"txt"); + sdsfree(info); + } else if (!strcasecmp(c->argv[1]->ptr,"keyslot") && c->argc == 3) { + /* CLUSTER KEYSLOT */ + sds key = c->argv[2]->ptr; + + addReplyLongLong(c,keyHashSlot(key,sdslen(key))); + } else if (!strcasecmp(c->argv[1]->ptr,"countkeysinslot") && c->argc == 3) { + /* CLUSTER COUNTKEYSINSLOT */ + long long slot; + + if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != C_OK) + return; + if (slot < 0 || slot >= CLUSTER_SLOTS) { + addReplyError(c,"Invalid slot"); + return; + } + addReplyLongLong(c,countKeysInSlot(slot)); + } else if (!strcasecmp(c->argv[1]->ptr,"getkeysinslot") && c->argc == 4) { + /* CLUSTER GETKEYSINSLOT */ + long long maxkeys, slot; + + if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != C_OK) + return; + if (getLongLongFromObjectOrReply(c,c->argv[3],&maxkeys,NULL) + != C_OK) + return; + if (slot < 0 || slot >= CLUSTER_SLOTS || maxkeys < 0) { + addReplyError(c,"Invalid slot or number of keys"); + return; + } + + unsigned int keys_in_slot = countKeysInSlot(slot); + unsigned int numkeys = maxkeys > keys_in_slot ? 
keys_in_slot : maxkeys; + addReplyArrayLen(c,numkeys); + dictIterator *iter = NULL; + dictEntry *de = NULL; + iter = dictGetIterator(server.db->dict[slot]); + for (unsigned int i = 0; i < numkeys; i++) { + de = dictNext(iter); + serverAssert(de != NULL); + sds sdskey = dictGetKey(de); + addReplyBulkCBuffer(c, sdskey, sdslen(sdskey)); + } + dictReleaseIterator(iter); + } else if ((!strcasecmp(c->argv[1]->ptr,"slaves") || + !strcasecmp(c->argv[1]->ptr,"replicas")) && c->argc == 3) { + /* CLUSTER SLAVES */ + /* CLUSTER REPLICAS */ + clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); + int j; + + /* Lookup the specified node in our table. */ + if (!n) { + addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); + return; + } + + if (clusterNodeIsSlave(n)) { + addReplyError(c,"The specified node is not a master"); + return; + } + + /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */ + addReplyArrayLen(c, clusterNodeNumSlaves(n)); + for (j = 0; j < clusterNodeNumSlaves(n); j++) { + sds ni = clusterGenNodeDescription(c, clusterNodeGetSlave(n, j), shouldReturnTlsInfo()); + addReplyBulkCString(c,ni); + sdsfree(ni); + } + } else if(!clusterCommandSpecial(c)) { + addReplySubcommandSyntaxError(c); + return; + } +} + +/* Return the pointer to the cluster node that is able to serve the command. + * For the function to succeed the command should only target either: + * + * 1) A single key (even multiple times like RPOPLPUSH mylist mylist). + * 2) Multiple keys in the same hash slot, while the slot is stable (no + * resharding in progress). + * + * On success the function returns the node that is able to serve the request. + * If the node is not 'myself' a redirection must be performed. The kind of + * redirection is specified setting the integer passed by reference + * 'error_code', which will be set to CLUSTER_REDIR_ASK or + * CLUSTER_REDIR_MOVED. 
+ * + * When the node is 'myself' 'error_code' is set to CLUSTER_REDIR_NONE. + * + * If the command fails NULL is returned, and the reason of the failure is + * provided via 'error_code', which will be set to: + * + * CLUSTER_REDIR_CROSS_SLOT if the request contains multiple keys that + * don't belong to the same hash slot. + * + * CLUSTER_REDIR_UNSTABLE if the request contains multiple keys + * belonging to the same slot, but the slot is not stable (in migration or + * importing state, likely because a resharding is in progress). + * + * CLUSTER_REDIR_DOWN_UNBOUND if the request addresses a slot which is + * not bound to any node. In this case the cluster global state should be + * already "down" but it is fragile to rely on the update of the global state, + * so we also handle it here. + * + * CLUSTER_REDIR_DOWN_STATE and CLUSTER_REDIR_DOWN_RO_STATE if the cluster is + * down but the user attempts to execute a command that addresses one or more keys. */ +clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *error_code) { + clusterNode *myself = getMyClusterNode(); + clusterNode *n = NULL; + robj *firstkey = NULL; + int multiple_keys = 0; + multiState *ms, _ms; + multiCmd mc; + int i, slot = 0, migrating_slot = 0, importing_slot = 0, missing_keys = 0, + existing_keys = 0; + + /* Allow any key to be set if a module disabled cluster redirections. */ + if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION) + return myself; + + /* Set error code optimistically for the base case. */ + if (error_code) *error_code = CLUSTER_REDIR_NONE; + + /* Modules can turn off Redis Cluster redirection: this is useful + * when writing a module that implements a completely different + * distributed system. 
*/ + + /* We handle all the cases as if they were EXEC commands, so we have + * a common code path for everything */ + if (cmd->proc == execCommand) { + /* If CLIENT_MULTI flag is not set EXEC is just going to return an + * error. */ + if (!(c->flags & CLIENT_MULTI)) return myself; + ms = &c->mstate; + } else { + /* In order to have a single codepath create a fake Multi State + * structure if the client is not in MULTI/EXEC state, this way + * we have a single codepath below. */ + ms = &_ms; + _ms.commands = &mc; + _ms.count = 1; + mc.argv = argv; + mc.argc = argc; + mc.cmd = cmd; + } + + int is_pubsubshard = cmd->proc == ssubscribeCommand || + cmd->proc == sunsubscribeCommand || + cmd->proc == spublishCommand; + + /* Check that all the keys are in the same hash slot, and obtain this + * slot and the node associated. */ + for (i = 0; i < ms->count; i++) { + struct redisCommand *mcmd; + robj **margv; + int margc, numkeys, j; + keyReference *keyindex; + + mcmd = ms->commands[i].cmd; + margc = ms->commands[i].argc; + margv = ms->commands[i].argv; + + getKeysResult result = GETKEYS_RESULT_INIT; + numkeys = getKeysFromCommand(mcmd,margv,margc,&result); + keyindex = result.keys; + + for (j = 0; j < numkeys; j++) { + robj *thiskey = margv[keyindex[j].pos]; + int thisslot = keyHashSlot((char*)thiskey->ptr, + sdslen(thiskey->ptr)); + + if (firstkey == NULL) { + /* This is the first key we see. Check what is the slot + * and node. */ + firstkey = thiskey; + slot = thisslot; + n = getNodeBySlot(slot); + + /* Error: If a slot is not served, we are in "cluster down" + * state. However the state is yet to be updated, so this was + * not trapped earlier in processCommand(). Report the same + * error to the client. 
*/ + if (n == NULL) { + getKeysFreeResult(&result); + if (error_code) + *error_code = CLUSTER_REDIR_DOWN_UNBOUND; + return NULL; + } + + /* If we are migrating or importing this slot, we need to check + * if we have all the keys in the request (the only way we + * can safely serve the request, otherwise we return a TRYAGAIN + * error). To do so we set the importing/migrating state and + * increment a counter for every missing key. */ + if (n == myself && + getMigratingSlotDest(slot) != NULL) + { + migrating_slot = 1; + } else if (getImportingSlotSource(slot) != NULL) { + importing_slot = 1; + } + } else { + /* If it is not the first key/channel, make sure it is exactly + * the same key/channel as the first we saw. */ + if (slot != thisslot) { + /* Error: multiple keys from different slots. */ + getKeysFreeResult(&result); + if (error_code) + *error_code = CLUSTER_REDIR_CROSS_SLOT; + return NULL; + } + if (importing_slot && !multiple_keys && !equalStringObjects(firstkey,thiskey)) { + /* Flag this request as one with multiple different + * keys/channels when the slot is in importing state. */ + multiple_keys = 1; + } + } + + /* Migrating / Importing slot? Count keys we don't have. + * If it is pubsubshard command, it isn't required to check + * the channel being present or not in the node during the + * slot migration, the channel will be served from the source + * node until the migration completes with CLUSTER SETSLOT + * NODE . */ + int flags = LOOKUP_NOTOUCH | LOOKUP_NOSTATS | LOOKUP_NONOTIFY | LOOKUP_NOEXPIRE; + if ((migrating_slot || importing_slot) && !is_pubsubshard) + { + if (lookupKeyReadWithFlags(&server.db[0], thiskey, flags) == NULL) missing_keys++; + else existing_keys++; + } + } + getKeysFreeResult(&result); + } + + /* No key at all in command? then we can serve the request + * without redirections or errors in all the cases. */ + if (n == NULL) return myself; + + uint64_t cmd_flags = getCommandFlags(c); + /* Cluster is globally down but we got keys? 
We only serve the request + * if it is a read command and when allow_reads_when_down is enabled. */ + if (!isClusterHealthy()) { + if (is_pubsubshard) { + if (!server.cluster_allow_pubsubshard_when_down) { + if (error_code) *error_code = CLUSTER_REDIR_DOWN_STATE; + return NULL; + } + } else if (!server.cluster_allow_reads_when_down) { + /* The cluster is configured to block commands when the + * cluster is down. */ + if (error_code) *error_code = CLUSTER_REDIR_DOWN_STATE; + return NULL; + } else if (cmd_flags & CMD_WRITE) { + /* The cluster is configured to allow read only commands */ + if (error_code) *error_code = CLUSTER_REDIR_DOWN_RO_STATE; + return NULL; + } else { + /* Fall through and allow the command to be executed: + * this happens when server.cluster_allow_reads_when_down is + * true and the command is not a write command */ + } + } + + /* Return the hashslot by reference. */ + if (hashslot) *hashslot = slot; + + /* MIGRATE always works in the context of the local node if the slot + * is open (migrating or importing state). We need to be able to freely + * move keys among instances in this case. */ + if ((migrating_slot || importing_slot) && cmd->proc == migrateCommand) + return myself; + + /* If we don't have all the keys and we are migrating the slot, send + * an ASK redirection or TRYAGAIN. */ + if (migrating_slot && missing_keys) { + /* If we have keys but we don't have all keys, we return TRYAGAIN */ + if (existing_keys) { + if (error_code) *error_code = CLUSTER_REDIR_UNSTABLE; + return NULL; + } else { + if (error_code) *error_code = CLUSTER_REDIR_ASK; + return getMigratingSlotDest(slot); + } + } + + /* If we are receiving the slot, and the client correctly flagged the + * request as "ASKING", we can serve the request. However if the request + * involves multiple keys and we don't have them all, the only option is + * to send a TRYAGAIN error. 
*/ + if (importing_slot && + (c->flags & CLIENT_ASKING || cmd_flags & CMD_ASKING)) + { + if (multiple_keys && missing_keys) { + if (error_code) *error_code = CLUSTER_REDIR_UNSTABLE; + return NULL; + } else { + return myself; + } + } + + /* Handle the read-only client case reading from a slave: if this + * node is a slave and the request is about a hash slot our master + * is serving, we can reply without redirection. */ + int is_write_command = (cmd_flags & CMD_WRITE) || + (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_WRITE)); + if (((c->flags & CLIENT_READONLY) || is_pubsubshard) && + !is_write_command && + clusterNodeIsSlave(myself) && + clusterNodeGetSlaveof(myself) == n) + { + return myself; + } + + /* Base case: just return the right node. However, if this node is not + * myself, set error_code to MOVED since we need to issue a redirection. */ + if (n != myself && error_code) *error_code = CLUSTER_REDIR_MOVED; + return n; +} + +/* Send the client the right redirection code, according to error_code + * that should be set to one of CLUSTER_REDIR_* macros. + * + * If CLUSTER_REDIR_ASK or CLUSTER_REDIR_MOVED error codes + * are used, then the node 'n' should not be NULL, but should be the + * node we want to mention in the redirection. Moreover hashslot should + * be set to the hash slot that caused the redirection. */ +void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code) { + if (error_code == CLUSTER_REDIR_CROSS_SLOT) { + addReplyError(c,"-CROSSSLOT Keys in request don't hash to the same slot"); + } else if (error_code == CLUSTER_REDIR_UNSTABLE) { + /* The request spawns multiple keys in the same slot, + * but the slot is not "stable" currently as there is + * a migration or import in progress. 
*/ + addReplyError(c,"-TRYAGAIN Multiple keys request during rehashing of slot"); + } else if (error_code == CLUSTER_REDIR_DOWN_STATE) { + addReplyError(c,"-CLUSTERDOWN The cluster is down"); + } else if (error_code == CLUSTER_REDIR_DOWN_RO_STATE) { + addReplyError(c,"-CLUSTERDOWN The cluster is down and only accepts read commands"); + } else if (error_code == CLUSTER_REDIR_DOWN_UNBOUND) { + addReplyError(c,"-CLUSTERDOWN Hash slot not served"); + } else if (error_code == CLUSTER_REDIR_MOVED || + error_code == CLUSTER_REDIR_ASK) + { + /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */ + int port = clusterNodeClientPort(n, shouldReturnTlsInfo()); + addReplyErrorSds(c,sdscatprintf(sdsempty(), + "-%s %d %s:%d", + (error_code == CLUSTER_REDIR_ASK) ? "ASK" : "MOVED", + hashslot, clusterNodePreferredEndpoint(n), port)); + } else { + serverPanic("getNodeByQuery() unknown error."); + } +} + +/* This function is called by the function processing clients incrementally + * to detect timeouts, in order to handle the following case: + * + * 1) A client blocks with BLPOP or similar blocking operation. + * 2) The master migrates the hash slot elsewhere or turns into a slave. + * 3) The client may remain blocked forever (or up to the max timeout time) + * waiting for a key change that will never happen. + * + * If the client is found to be blocked into a hash slot this node no + * longer handles, the client is sent a redirection error, and the function + * returns 1. Otherwise 0 is returned and no operation is performed. */ +int clusterRedirectBlockedClientIfNeeded(client *c) { + clusterNode *myself = getMyClusterNode(); + if (c->flags & CLIENT_BLOCKED && + (c->bstate.btype == BLOCKED_LIST || + c->bstate.btype == BLOCKED_ZSET || + c->bstate.btype == BLOCKED_STREAM || + c->bstate.btype == BLOCKED_MODULE)) + { + dictEntry *de; + dictIterator *di; + + /* If the cluster is down, unblock the client with the right error. 
+ * If the cluster is configured to allow reads on cluster down, we + * still want to emit this error since a write will be required + * to unblock them which may never come. */ + if (!isClusterHealthy()) { + clusterRedirectClient(c,NULL,0,CLUSTER_REDIR_DOWN_STATE); + return 1; + } + + /* If the client is blocked on module, but not on a specific key, + * don't unblock it (except for the CLUSTER_FAIL case above). */ + if (c->bstate.btype == BLOCKED_MODULE && !moduleClientIsBlockedOnKeys(c)) + return 0; + + /* All keys must belong to the same slot, so check first key only. */ + di = dictGetIterator(c->bstate.keys); + if ((de = dictNext(di)) != NULL) { + robj *key = dictGetKey(de); + int slot = keyHashSlot((char*)key->ptr, sdslen(key->ptr)); + clusterNode *node = getNodeBySlot(slot); + + /* if the client is read-only and attempting to access key that our + * replica can handle, allow it. */ + if ((c->flags & CLIENT_READONLY) && + !(c->lastcmd->flags & CMD_WRITE) && + clusterNodeIsSlave(myself) && clusterNodeGetSlaveof(myself) == node) + { + node = myself; + } + + /* We send an error and unblock the client if: + * 1) The slot is unassigned, emitting a cluster down error. + * 2) The slot is not handled by this node, nor being imported. */ + if (node != myself && getImportingSlotSource(slot) == NULL) + { + if (node == NULL) { + clusterRedirectClient(c,NULL,0, + CLUSTER_REDIR_DOWN_UNBOUND); + } else { + clusterRedirectClient(c,node,slot, + CLUSTER_REDIR_MOVED); + } + dictReleaseIterator(di); + return 1; + } + } + dictReleaseIterator(di); + } + return 0; +} + +/* Returns an indication if the replica node is fully available + * and should be listed in CLUSTER SLOTS response. + * Returns 1 for available nodes, 0 for nodes that have + * not finished their initial sync, in failed state, or are + * otherwise considered not available to serve read commands. 
*/ +static int isReplicaAvailable(clusterNode *node) { + if (clusterNodeIsFailing(node)) { + return 0; + } + long long repl_offset = clusterNodeReplOffset(node); + if (clusterNodeIsMyself(node)) { + /* Nodes do not update their own information + * in the cluster node list. */ + repl_offset = replicationGetSlaveOffset(); + } + return (repl_offset != 0); +} + +void addNodeToNodeReply(client *c, clusterNode *node) { + char* hostname = clusterNodeHostname(node); + addReplyArrayLen(c, 4); + if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_IP) { + addReplyBulkCString(c, clusterNodeIp(node)); + } else if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_HOSTNAME) { + if (hostname != NULL && hostname[0] != '\0') { + addReplyBulkCString(c, hostname); + } else { + addReplyBulkCString(c, "?"); + } + } else if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_UNKNOWN_ENDPOINT) { + addReplyNull(c); + } else { + serverPanic("Unrecognized preferred endpoint type"); + } + + /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */ + addReplyLongLong(c, clusterNodeClientPort(node, shouldReturnTlsInfo())); + addReplyBulkCBuffer(c, clusterNodeGetName(node), CLUSTER_NAMELEN); + + /* Add the additional endpoint information, this is all the known networking information + * that is not the preferred endpoint. Note the logic is evaluated twice so we can + * correctly report the number of additional network arguments without using a deferred + * map, an assertion is made at the end to check we set the right length. 
*/ + int length = 0; + if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_IP) { + length++; + } + if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_HOSTNAME + && hostname != NULL && hostname[0] != '\0') + { + length++; + } + addReplyMapLen(c, length); + + if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_IP) { + addReplyBulkCString(c, "ip"); + addReplyBulkCString(c, clusterNodeIp(node)); + length--; + } + if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_HOSTNAME + && hostname != NULL && hostname[0] != '\0') + { + addReplyBulkCString(c, "hostname"); + addReplyBulkCString(c, hostname); + length--; + } + serverAssert(length == 0); +} + +void addNodeReplyForClusterSlot(client *c, clusterNode *node, int start_slot, int end_slot) { + int i, nested_elements = 3; /* slots (2) + master addr (1) */ + for (i = 0; i < clusterNodeNumSlaves(node); i++) { + if (!isReplicaAvailable(clusterNodeGetSlave(node, i))) continue; + nested_elements++; + } + addReplyArrayLen(c, nested_elements); + addReplyLongLong(c, start_slot); + addReplyLongLong(c, end_slot); + addNodeToNodeReply(c, node); + + /* Remaining nodes in reply are replicas for slot range */ + for (i = 0; i < clusterNodeNumSlaves(node); i++) { + /* This loop is copy/pasted from clusterGenNodeDescription() + * with modifications for per-slot node aggregation. */ + if (!isReplicaAvailable(clusterNodeGetSlave(node, i))) continue; + addNodeToNodeReply(c, clusterNodeGetSlave(node, i)); + nested_elements--; + } + serverAssert(nested_elements == 3); /* Original 3 elements */ +} + +void clusterCommandSlots(client * c) { + /* Format: 1) 1) start slot + * 2) end slot + * 3) 1) master IP + * 2) master port + * 3) node ID + * 4) 1) replica IP + * 2) replica port + * 3) node ID + * ... 
continued until done + */ + clusterNode *n = NULL; + int num_masters = 0, start = -1; + void *slot_replylen = addReplyDeferredLen(c); + + for (int i = 0; i <= CLUSTER_SLOTS; i++) { + /* Find start node and slot id. */ + if (n == NULL) { + if (i == CLUSTER_SLOTS) break; + n = getNodeBySlot(i); + start = i; + continue; + } + + /* Add cluster slots info when occur different node with start + * or end of slot. */ + if (i == CLUSTER_SLOTS || n != getNodeBySlot(i)) { + addNodeReplyForClusterSlot(c, n, start, i-1); + num_masters++; + if (i == CLUSTER_SLOTS) break; + n = getNodeBySlot(i); + start = i; + } + } + setDeferredArrayLen(c, slot_replylen, num_masters); +} + /* ----------------------------------------------------------------------------- * Cluster functions related to serving / redirecting clients * -------------------------------------------------------------------------- */ @@ -7223,435 +1442,3 @@ void readwriteCommand(client *c) { c->flags &= ~CLIENT_READONLY; addReply(c,shared.ok); } - -/* Return the pointer to the cluster node that is able to serve the command. - * For the function to succeed the command should only target either: - * - * 1) A single key (even multiple times like RPOPLPUSH mylist mylist). - * 2) Multiple keys in the same hash slot, while the slot is stable (no - * resharding in progress). - * - * On success the function returns the node that is able to serve the request. - * If the node is not 'myself' a redirection must be performed. The kind of - * redirection is specified setting the integer passed by reference - * 'error_code', which will be set to CLUSTER_REDIR_ASK or - * CLUSTER_REDIR_MOVED. - * - * When the node is 'myself' 'error_code' is set to CLUSTER_REDIR_NONE. - * - * If the command fails NULL is returned, and the reason of the failure is - * provided via 'error_code', which will be set to: - * - * CLUSTER_REDIR_CROSS_SLOT if the request contains multiple keys that - * don't belong to the same hash slot. 
- * - * CLUSTER_REDIR_UNSTABLE if the request contains multiple keys - * belonging to the same slot, but the slot is not stable (in migration or - * importing state, likely because a resharding is in progress). - * - * CLUSTER_REDIR_DOWN_UNBOUND if the request addresses a slot which is - * not bound to any node. In this case the cluster global state should be - * already "down" but it is fragile to rely on the update of the global state, - * so we also handle it here. - * - * CLUSTER_REDIR_DOWN_STATE and CLUSTER_REDIR_DOWN_RO_STATE if the cluster is - * down but the user attempts to execute a command that addresses one or more keys. */ -clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *error_code) { - clusterNode *n = NULL; - robj *firstkey = NULL; - int multiple_keys = 0; - multiState *ms, _ms; - multiCmd mc; - int i, slot = 0, migrating_slot = 0, importing_slot = 0, missing_keys = 0, - existing_keys = 0; - - /* Allow any key to be set if a module disabled cluster redirections. */ - if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION) - return myself; - - /* Set error code optimistically for the base case. */ - if (error_code) *error_code = CLUSTER_REDIR_NONE; - - /* Modules can turn off Redis Cluster redirection: this is useful - * when writing a module that implements a completely different - * distributed system. */ - - /* We handle all the cases as if they were EXEC commands, so we have - * a common code path for everything */ - if (cmd->proc == execCommand) { - /* If CLIENT_MULTI flag is not set EXEC is just going to return an - * error. */ - if (!(c->flags & CLIENT_MULTI)) return myself; - ms = &c->mstate; - } else { - /* In order to have a single codepath create a fake Multi State - * structure if the client is not in MULTI/EXEC state, this way - * we have a single codepath below. 
*/ - ms = &_ms; - _ms.commands = &mc; - _ms.count = 1; - mc.argv = argv; - mc.argc = argc; - mc.cmd = cmd; - } - - int is_pubsubshard = cmd->proc == ssubscribeCommand || - cmd->proc == sunsubscribeCommand || - cmd->proc == spublishCommand; - - /* Check that all the keys are in the same hash slot, and obtain this - * slot and the node associated. */ - for (i = 0; i < ms->count; i++) { - struct redisCommand *mcmd; - robj **margv; - int margc, numkeys, j; - keyReference *keyindex; - - mcmd = ms->commands[i].cmd; - margc = ms->commands[i].argc; - margv = ms->commands[i].argv; - - getKeysResult result = GETKEYS_RESULT_INIT; - numkeys = getKeysFromCommand(mcmd,margv,margc,&result); - keyindex = result.keys; - - for (j = 0; j < numkeys; j++) { - robj *thiskey = margv[keyindex[j].pos]; - int thisslot = keyHashSlot((char*)thiskey->ptr, - sdslen(thiskey->ptr)); - - if (firstkey == NULL) { - /* This is the first key we see. Check what is the slot - * and node. */ - firstkey = thiskey; - slot = thisslot; - n = server.cluster->slots[slot]; - - /* Error: If a slot is not served, we are in "cluster down" - * state. However the state is yet to be updated, so this was - * not trapped earlier in processCommand(). Report the same - * error to the client. */ - if (n == NULL) { - getKeysFreeResult(&result); - if (error_code) - *error_code = CLUSTER_REDIR_DOWN_UNBOUND; - return NULL; - } - - /* If we are migrating or importing this slot, we need to check - * if we have all the keys in the request (the only way we - * can safely serve the request, otherwise we return a TRYAGAIN - * error). To do so we set the importing/migrating state and - * increment a counter for every missing key. 
*/ - if (n == myself && - server.cluster->migrating_slots_to[slot] != NULL) - { - migrating_slot = 1; - } else if (server.cluster->importing_slots_from[slot] != NULL) { - importing_slot = 1; - } - } else { - /* If it is not the first key/channel, make sure it is exactly - * the same key/channel as the first we saw. */ - if (slot != thisslot) { - /* Error: multiple keys from different slots. */ - getKeysFreeResult(&result); - if (error_code) - *error_code = CLUSTER_REDIR_CROSS_SLOT; - return NULL; - } - if (importing_slot && !multiple_keys && !equalStringObjects(firstkey,thiskey)) { - /* Flag this request as one with multiple different - * keys/channels when the slot is in importing state. */ - multiple_keys = 1; - } - } - - /* Migrating / Importing slot? Count keys we don't have. - * If it is pubsubshard command, it isn't required to check - * the channel being present or not in the node during the - * slot migration, the channel will be served from the source - * node until the migration completes with CLUSTER SETSLOT - * NODE . */ - int flags = LOOKUP_NOTOUCH | LOOKUP_NOSTATS | LOOKUP_NONOTIFY | LOOKUP_NOEXPIRE; - if ((migrating_slot || importing_slot) && !is_pubsubshard) - { - if (lookupKeyReadWithFlags(&server.db[0], thiskey, flags) == NULL) missing_keys++; - else existing_keys++; - } - } - getKeysFreeResult(&result); - } - - /* No key at all in command? then we can serve the request - * without redirections or errors in all the cases. */ - if (n == NULL) return myself; - - uint64_t cmd_flags = getCommandFlags(c); - /* Cluster is globally down but we got keys? We only serve the request - * if it is a read command and when allow_reads_when_down is enabled. 
*/ - if (server.cluster->state != CLUSTER_OK) { - if (is_pubsubshard) { - if (!server.cluster_allow_pubsubshard_when_down) { - if (error_code) *error_code = CLUSTER_REDIR_DOWN_STATE; - return NULL; - } - } else if (!server.cluster_allow_reads_when_down) { - /* The cluster is configured to block commands when the - * cluster is down. */ - if (error_code) *error_code = CLUSTER_REDIR_DOWN_STATE; - return NULL; - } else if (cmd_flags & CMD_WRITE) { - /* The cluster is configured to allow read only commands */ - if (error_code) *error_code = CLUSTER_REDIR_DOWN_RO_STATE; - return NULL; - } else { - /* Fall through and allow the command to be executed: - * this happens when server.cluster_allow_reads_when_down is - * true and the command is not a write command */ - } - } - - /* Return the hashslot by reference. */ - if (hashslot) *hashslot = slot; - - /* MIGRATE always works in the context of the local node if the slot - * is open (migrating or importing state). We need to be able to freely - * move keys among instances in this case. */ - if ((migrating_slot || importing_slot) && cmd->proc == migrateCommand) - return myself; - - /* If we don't have all the keys and we are migrating the slot, send - * an ASK redirection or TRYAGAIN. */ - if (migrating_slot && missing_keys) { - /* If we have keys but we don't have all keys, we return TRYAGAIN */ - if (existing_keys) { - if (error_code) *error_code = CLUSTER_REDIR_UNSTABLE; - return NULL; - } else { - if (error_code) *error_code = CLUSTER_REDIR_ASK; - return server.cluster->migrating_slots_to[slot]; - } - } - - /* If we are receiving the slot, and the client correctly flagged the - * request as "ASKING", we can serve the request. However if the request - * involves multiple keys and we don't have them all, the only option is - * to send a TRYAGAIN error. 
*/ - if (importing_slot && - (c->flags & CLIENT_ASKING || cmd_flags & CMD_ASKING)) - { - if (multiple_keys && missing_keys) { - if (error_code) *error_code = CLUSTER_REDIR_UNSTABLE; - return NULL; - } else { - return myself; - } - } - - /* Handle the read-only client case reading from a slave: if this - * node is a slave and the request is about a hash slot our master - * is serving, we can reply without redirection. */ - int is_write_command = (cmd_flags & CMD_WRITE) || - (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_WRITE)); - if (((c->flags & CLIENT_READONLY) || is_pubsubshard) && - !is_write_command && - nodeIsSlave(myself) && - myself->slaveof == n) - { - return myself; - } - - /* Base case: just return the right node. However if this node is not - * myself, set error_code to MOVED since we need to issue a redirection. */ - if (n != myself && error_code) *error_code = CLUSTER_REDIR_MOVED; - return n; -} - -/* Send the client the right redirection code, according to error_code - * that should be set to one of CLUSTER_REDIR_* macros. - * - * If CLUSTER_REDIR_ASK or CLUSTER_REDIR_MOVED error codes - * are used, then the node 'n' should not be NULL, but should be the - * node we want to mention in the redirection. Moreover hashslot should - * be set to the hash slot that caused the redirection. */ -void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code) { - if (error_code == CLUSTER_REDIR_CROSS_SLOT) { - addReplyError(c,"-CROSSSLOT Keys in request don't hash to the same slot"); - } else if (error_code == CLUSTER_REDIR_UNSTABLE) { - /* The request spawns multiple keys in the same slot, - * but the slot is not "stable" currently as there is - * a migration or import in progress. 
*/ - addReplyError(c,"-TRYAGAIN Multiple keys request during rehashing of slot"); - } else if (error_code == CLUSTER_REDIR_DOWN_STATE) { - addReplyError(c,"-CLUSTERDOWN The cluster is down"); - } else if (error_code == CLUSTER_REDIR_DOWN_RO_STATE) { - addReplyError(c,"-CLUSTERDOWN The cluster is down and only accepts read commands"); - } else if (error_code == CLUSTER_REDIR_DOWN_UNBOUND) { - addReplyError(c,"-CLUSTERDOWN Hash slot not served"); - } else if (error_code == CLUSTER_REDIR_MOVED || - error_code == CLUSTER_REDIR_ASK) - { - /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */ - int port = getNodeClientPort(n, shouldReturnTlsInfo()); - addReplyErrorSds(c,sdscatprintf(sdsempty(), - "-%s %d %s:%d", - (error_code == CLUSTER_REDIR_ASK) ? "ASK" : "MOVED", - hashslot, getPreferredEndpoint(n), port)); - } else { - serverPanic("getNodeByQuery() unknown error."); - } -} - -/* This function is called by the function processing clients incrementally - * to detect timeouts, in order to handle the following case: - * - * 1) A client blocks with BLPOP or similar blocking operation. - * 2) The master migrates the hash slot elsewhere or turns into a slave. - * 3) The client may remain blocked forever (or up to the max timeout time) - * waiting for a key change that will never happen. - * - * If the client is found to be blocked into a hash slot this node no - * longer handles, the client is sent a redirection error, and the function - * returns 1. Otherwise 0 is returned and no operation is performed. */ -int clusterRedirectBlockedClientIfNeeded(client *c) { - if (c->flags & CLIENT_BLOCKED && - (c->bstate.btype == BLOCKED_LIST || - c->bstate.btype == BLOCKED_ZSET || - c->bstate.btype == BLOCKED_STREAM || - c->bstate.btype == BLOCKED_MODULE)) - { - dictEntry *de; - dictIterator *di; - - /* If the cluster is down, unblock the client with the right error. 
- * If the cluster is configured to allow reads on cluster down, we - * still want to emit this error since a write will be required - * to unblock them which may never come. */ - if (server.cluster->state == CLUSTER_FAIL) { - clusterRedirectClient(c,NULL,0,CLUSTER_REDIR_DOWN_STATE); - return 1; - } - - /* If the client is blocked on module, but not on a specific key, - * don't unblock it (except for the CLUSTER_FAIL case above). */ - if (c->bstate.btype == BLOCKED_MODULE && !moduleClientIsBlockedOnKeys(c)) - return 0; - - /* All keys must belong to the same slot, so check first key only. */ - di = dictGetIterator(c->bstate.keys); - if ((de = dictNext(di)) != NULL) { - robj *key = dictGetKey(de); - int slot = keyHashSlot((char*)key->ptr, sdslen(key->ptr)); - clusterNode *node = server.cluster->slots[slot]; - - /* if the client is read-only and attempting to access key that our - * replica can handle, allow it. */ - if ((c->flags & CLIENT_READONLY) && - !(c->lastcmd->flags & CMD_WRITE) && - nodeIsSlave(myself) && myself->slaveof == node) - { - node = myself; - } - - /* We send an error and unblock the client if: - * 1) The slot is unassigned, emitting a cluster down error. - * 2) The slot is not handled by this node, nor being imported. */ - if (node != myself && - server.cluster->importing_slots_from[slot] == NULL) - { - if (node == NULL) { - clusterRedirectClient(c,NULL,0, - CLUSTER_REDIR_DOWN_UNBOUND); - } else { - clusterRedirectClient(c,node,slot, - CLUSTER_REDIR_MOVED); - } - dictReleaseIterator(di); - return 1; - } - } - dictReleaseIterator(di); - } - return 0; -} - -/* Remove all the keys in the specified hash slot. - * The number of removed items is returned. 
*/ -unsigned int delKeysInSlot(unsigned int hashslot) { - unsigned int j = 0; - - dictIterator *iter = NULL; - dictEntry *de = NULL; - iter = dictGetSafeIterator(server.db->dict[hashslot]); - while((de = dictNext(iter)) != NULL) { - sds sdskey = dictGetKey(de); - robj *key = createStringObject(sdskey, sdslen(sdskey)); - dbDelete(&server.db[0], key); - propagateDeletion(&server.db[0], key, server.lazyfree_lazy_server_del); - signalModifiedKey(NULL, &server.db[0], key); - /* The keys are not actually logically deleted from the database, just moved to another node. - * The modules needs to know that these keys are no longer available locally, so just send the - * keyspace notification to the modules, but not to clients. */ - moduleNotifyKeyspaceEvent(NOTIFY_GENERIC, "del", key, server.db[0].id); - postExecutionUnitOperations(); - decrRefCount(key); - j++; - server.dirty++; - } - dictReleaseIterator(iter); - - return j; -} - -unsigned int countKeysInSlot(unsigned int slot) { - return dictSize(server.db->dict[slot]); -} - -/* ----------------------------------------------------------------------------- - * Operation(s) on channel rax tree. 
- * -------------------------------------------------------------------------- */ - -void slotToChannelUpdate(sds channel, int add) { - size_t keylen = sdslen(channel); - unsigned int hashslot = keyHashSlot(channel,keylen); - unsigned char buf[64]; - unsigned char *indexed = buf; - - if (keylen+2 > 64) indexed = zmalloc(keylen+2); - indexed[0] = (hashslot >> 8) & 0xff; - indexed[1] = hashslot & 0xff; - memcpy(indexed+2,channel,keylen); - if (add) { - raxInsert(server.cluster->slots_to_channels,indexed,keylen+2,NULL,NULL); - } else { - raxRemove(server.cluster->slots_to_channels,indexed,keylen+2,NULL); - } - if (indexed != buf) zfree(indexed); -} - -void slotToChannelAdd(sds channel) { - slotToChannelUpdate(channel,1); -} - -void slotToChannelDel(sds channel) { - slotToChannelUpdate(channel,0); -} - -/* Get the count of the channels for a given slot. */ -unsigned int countChannelsInSlot(unsigned int hashslot) { - raxIterator iter; - int j = 0; - unsigned char indexed[2]; - - indexed[0] = (hashslot >> 8) & 0xff; - indexed[1] = hashslot & 0xff; - raxStart(&iter,server.cluster->slots_to_channels); - raxSeek(&iter,">=",indexed,2); - while(raxNext(&iter)) { - if (iter.key[0] != indexed[0] || iter.key[1] != indexed[1]) break; - j++; - } - raxStop(&iter); - return j; -} diff --git a/src/cluster.h b/src/cluster.h index 0340349b2..02c5f67f3 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -2,7 +2,7 @@ #define __CLUSTER_H /*----------------------------------------------------------------------------- - * Redis cluster data structures, defines, exported API. + * Redis cluster exported API. *----------------------------------------------------------------------------*/ #define CLUSTER_SLOT_MASK_BITS 14 /* Number of bits used for slot id. 
*/ @@ -11,15 +11,6 @@ #define CLUSTER_OK 0 /* Everything looks ok */ #define CLUSTER_FAIL 1 /* The cluster can't work */ #define CLUSTER_NAMELEN 40 /* sha1 hex length */ -#define CLUSTER_PORT_INCR 10000 /* Cluster port = baseport + PORT_INCR */ - -/* The following defines are amount of time, sometimes expressed as - * multiplicators of the node timeout value (when ending with MULT). */ -#define CLUSTER_FAIL_REPORT_VALIDITY_MULT 2 /* Fail report validity. */ -#define CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */ -#define CLUSTER_MF_TIMEOUT 5000 /* Milliseconds to do a manual failover. */ -#define CLUSTER_MF_PAUSE_MULT 2 /* Master pause manual failover mult. */ -#define CLUSTER_SLAVE_MIGRATION_DELAY 5000 /* Delay for slave migration. */ /* Redirection errors returned by getNodeByQuery(). */ #define CLUSTER_REDIR_NONE 0 /* Node can serve the request. */ @@ -31,77 +22,8 @@ #define CLUSTER_REDIR_DOWN_UNBOUND 6 /* -CLUSTERDOWN, unbound slot. */ #define CLUSTER_REDIR_DOWN_RO_STATE 7 /* -CLUSTERDOWN, allow reads. */ -struct clusterNode; - -/* clusterLink encapsulates everything needed to talk with a remote node. */ -typedef struct clusterLink { - mstime_t ctime; /* Link creation time */ - connection *conn; /* Connection to remote node */ - list *send_msg_queue; /* List of messages to be sent */ - size_t head_msg_send_offset; /* Number of bytes already sent of message at head of queue */ - unsigned long long send_msg_queue_mem; /* Memory in bytes used by message queue */ - char *rcvbuf; /* Packet reception buffer */ - size_t rcvbuf_len; /* Used size of rcvbuf */ - size_t rcvbuf_alloc; /* Allocated size of rcvbuf */ - struct clusterNode *node; /* Node related to this link. Initialized to NULL when unknown */ - int inbound; /* 1 if this link is an inbound link accepted from the related node */ -} clusterLink; - -/* Cluster node flags and macros. 
*/ -#define CLUSTER_NODE_MASTER 1 /* The node is a master */ -#define CLUSTER_NODE_SLAVE 2 /* The node is a slave */ -#define CLUSTER_NODE_PFAIL 4 /* Failure? Need acknowledge */ -#define CLUSTER_NODE_FAIL 8 /* The node is believed to be malfunctioning */ -#define CLUSTER_NODE_MYSELF 16 /* This node is myself */ -#define CLUSTER_NODE_HANDSHAKE 32 /* We have still to exchange the first ping */ -#define CLUSTER_NODE_NOADDR 64 /* We don't know the address of this node */ -#define CLUSTER_NODE_MEET 128 /* Send a MEET message to this node */ -#define CLUSTER_NODE_MIGRATE_TO 256 /* Master eligible for replica migration. */ -#define CLUSTER_NODE_NOFAILOVER 512 /* Slave will not try to failover. */ -#define CLUSTER_NODE_NULL_NAME "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" - -#define nodeIsMaster(n) ((n)->flags & CLUSTER_NODE_MASTER) -#define nodeIsSlave(n) ((n)->flags & CLUSTER_NODE_SLAVE) -#define nodeInHandshake(n) ((n)->flags & CLUSTER_NODE_HANDSHAKE) -#define nodeHasAddr(n) (!((n)->flags & CLUSTER_NODE_NOADDR)) -#define nodeWithoutAddr(n) ((n)->flags & CLUSTER_NODE_NOADDR) -#define nodeTimedOut(n) ((n)->flags & CLUSTER_NODE_PFAIL) -#define nodeFailed(n) ((n)->flags & CLUSTER_NODE_FAIL) -#define nodeCantFailover(n) ((n)->flags & CLUSTER_NODE_NOFAILOVER) - -/* Reasons why a slave is not able to failover. */ -#define CLUSTER_CANT_FAILOVER_NONE 0 -#define CLUSTER_CANT_FAILOVER_DATA_AGE 1 -#define CLUSTER_CANT_FAILOVER_WAITING_DELAY 2 -#define CLUSTER_CANT_FAILOVER_EXPIRED 3 -#define CLUSTER_CANT_FAILOVER_WAITING_VOTES 4 -#define CLUSTER_CANT_FAILOVER_RELOG_PERIOD (10) /* seconds. */ - -/* clusterState todo_before_sleep flags. 
*/ -#define CLUSTER_TODO_HANDLE_FAILOVER (1<<0) -#define CLUSTER_TODO_UPDATE_STATE (1<<1) -#define CLUSTER_TODO_SAVE_CONFIG (1<<2) -#define CLUSTER_TODO_FSYNC_CONFIG (1<<3) -#define CLUSTER_TODO_HANDLE_MANUALFAILOVER (1<<4) - -/* Message types. - * - * Note that the PING, PONG and MEET messages are actually the same exact - * kind of packet. PONG is the reply to ping, in the exact format as a PING, - * while MEET is a special PING that forces the receiver to add the sender - * as a node (if it is not already in the list). */ -#define CLUSTERMSG_TYPE_PING 0 /* Ping */ -#define CLUSTERMSG_TYPE_PONG 1 /* Pong (reply to Ping) */ -#define CLUSTERMSG_TYPE_MEET 2 /* Meet "let's join" message */ -#define CLUSTERMSG_TYPE_FAIL 3 /* Mark node xxx as failing */ -#define CLUSTERMSG_TYPE_PUBLISH 4 /* Pub/Sub Publish propagation */ -#define CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST 5 /* May I failover? */ -#define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6 /* Yes, you have my vote */ -#define CLUSTERMSG_TYPE_UPDATE 7 /* Another node slots configuration */ -#define CLUSTERMSG_TYPE_MFSTART 8 /* Pause clients for manual failover */ -#define CLUSTERMSG_TYPE_MODULE 9 /* Module cluster API message. */ -#define CLUSTERMSG_TYPE_PUBLISHSHARD 10 /* Pub/Sub Publish shard propagation */ -#define CLUSTERMSG_TYPE_COUNT 11 /* Total number of message types. */ +typedef struct _clusterNode clusterNode; +struct clusterState; /* Flags that a module can set in order to prevent certain Redis Cluster * features to be enabled. Useful when implementing a different distributed @@ -110,312 +32,87 @@ typedef struct clusterLink { #define CLUSTER_MODULE_FLAG_NO_FAILOVER (1<<1) #define CLUSTER_MODULE_FLAG_NO_REDIRECTION (1<<2) -/* This structure represent elements of node->fail_reports. */ -typedef struct clusterNodeFailReport { - struct clusterNode *node; /* Node reporting the failure condition. */ - mstime_t time; /* Time of the last report from this node. 
*/ -} clusterNodeFailReport; - -typedef struct clusterNode { - mstime_t ctime; /* Node object creation time. */ - char name[CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size */ - char shard_id[CLUSTER_NAMELEN]; /* shard id, hex string, sha1-size */ - int flags; /* CLUSTER_NODE_... */ - uint64_t configEpoch; /* Last configEpoch observed for this node */ - unsigned char slots[CLUSTER_SLOTS/8]; /* slots handled by this node */ - uint16_t *slot_info_pairs; /* Slots info represented as (start/end) pair (consecutive index). */ - int slot_info_pairs_count; /* Used number of slots in slot_info_pairs */ - int numslots; /* Number of slots handled by this node */ - int numslaves; /* Number of slave nodes, if this is a master */ - struct clusterNode **slaves; /* pointers to slave nodes */ - struct clusterNode *slaveof; /* pointer to the master node. Note that it - may be NULL even if the node is a slave - if we don't have the master node in our - tables. */ - unsigned long long last_in_ping_gossip; /* The number of the last carried in the ping gossip section */ - mstime_t ping_sent; /* Unix time we sent latest ping */ - mstime_t pong_received; /* Unix time we received the pong */ - mstime_t data_received; /* Unix time we received any data */ - mstime_t fail_time; /* Unix time when FAIL flag was set */ - mstime_t voted_time; /* Last time we voted for a slave of this master */ - mstime_t repl_offset_time; /* Unix time we received offset for this node */ - mstime_t orphaned_time; /* Starting time of orphaned master condition */ - long long repl_offset; /* Last known repl offset for this node. */ - char ip[NET_IP_STR_LEN]; /* Latest known IP address of this node */ - sds hostname; /* The known hostname for this node */ - sds human_nodename; /* The known human readable nodename for this node */ - int tcp_port; /* Latest known clients TCP port. */ - int tls_port; /* Latest known clients TLS port */ - int cport; /* Latest known cluster port of this node. 
*/ - clusterLink *link; /* TCP/IP link established toward this node */ - clusterLink *inbound_link; /* TCP/IP link accepted from this node */ - list *fail_reports; /* List of nodes signaling this as failing */ -} clusterNode; - -typedef struct clusterState { - clusterNode *myself; /* This node */ - uint64_t currentEpoch; - int state; /* CLUSTER_OK, CLUSTER_FAIL, ... */ - int size; /* Num of master nodes with at least one slot */ - dict *nodes; /* Hash table of name -> clusterNode structures */ - dict *shards; /* Hash table of shard_id -> list (of nodes) structures */ - dict *nodes_black_list; /* Nodes we don't re-add for a few seconds. */ - clusterNode *migrating_slots_to[CLUSTER_SLOTS]; - clusterNode *importing_slots_from[CLUSTER_SLOTS]; - clusterNode *slots[CLUSTER_SLOTS]; - rax *slots_to_channels; - /* The following fields are used to take the slave state on elections. */ - mstime_t failover_auth_time; /* Time of previous or next election. */ - int failover_auth_count; /* Number of votes received so far. */ - int failover_auth_sent; /* True if we already asked for votes. */ - int failover_auth_rank; /* This slave rank for current auth request. */ - uint64_t failover_auth_epoch; /* Epoch of the current election. */ - int cant_failover_reason; /* Why a slave is currently not able to - failover. See the CANT_FAILOVER_* macros. */ - /* Manual failover state in common. */ - mstime_t mf_end; /* Manual failover time limit (ms unixtime). - It is zero if there is no MF in progress. */ - /* Manual failover state of master. */ - clusterNode *mf_slave; /* Slave performing the manual failover. */ - /* Manual failover state of slave. */ - long long mf_master_offset; /* Master offset the slave needs to start MF - or -1 if still not received. */ - int mf_can_start; /* If non-zero signal that the manual failover - can start requesting masters vote. */ - /* The following fields are used by masters to take state on elections. 
*/ - uint64_t lastVoteEpoch; /* Epoch of the last vote granted. */ - int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */ - /* Stats */ - /* Messages received and sent by type. */ - long long stats_bus_messages_sent[CLUSTERMSG_TYPE_COUNT]; - long long stats_bus_messages_received[CLUSTERMSG_TYPE_COUNT]; - long long stats_pfail_nodes; /* Number of nodes in PFAIL status, - excluding nodes without address. */ - unsigned long long stat_cluster_links_buffer_limit_exceeded; /* Total number of cluster links freed due to exceeding buffer limit */ - - /* Bit map for slots that are no longer claimed by the owner in cluster PING - * messages. During slot migration, the owner will stop claiming the slot after - * the ownership transfer. Set the bit corresponding to the slot when a node - * stops claiming the slot. This prevents spreading incorrect information (that - * source still owns the slot) using UPDATE messages. */ - unsigned char owner_not_claiming_slot[CLUSTER_SLOTS / 8]; -} clusterState; - -/* Redis cluster messages header */ - -/* Initially we don't know our "name", but we'll find it once we connect - * to the first node, using the getsockname() function. Then we'll use this - * address for all the next messages. */ -typedef struct { - char nodename[CLUSTER_NAMELEN]; - uint32_t ping_sent; - uint32_t pong_received; - char ip[NET_IP_STR_LEN]; /* IP address last time it was seen */ - uint16_t port; /* primary port last time it was seen */ - uint16_t cport; /* cluster port last time it was seen */ - uint16_t flags; /* node->flags copy */ - uint16_t pport; /* secondary port last time it was seen */ - uint16_t notused1; -} clusterMsgDataGossip; - -typedef struct { - char nodename[CLUSTER_NAMELEN]; -} clusterMsgDataFail; - -typedef struct { - uint32_t channel_len; - uint32_t message_len; - unsigned char bulk_data[8]; /* 8 bytes just as placeholder. 
*/ -} clusterMsgDataPublish; - -typedef struct { - uint64_t configEpoch; /* Config epoch of the specified instance. */ - char nodename[CLUSTER_NAMELEN]; /* Name of the slots owner. */ - unsigned char slots[CLUSTER_SLOTS/8]; /* Slots bitmap. */ -} clusterMsgDataUpdate; - -typedef struct { - uint64_t module_id; /* ID of the sender module. */ - uint32_t len; /* ID of the sender module. */ - uint8_t type; /* Type from 0 to 255. */ - unsigned char bulk_data[3]; /* 3 bytes just as placeholder. */ -} clusterMsgModule; - -/* The cluster supports optional extension messages that can be sent - * along with ping/pong/meet messages to give additional info in a - * consistent manner. */ -typedef enum { - CLUSTERMSG_EXT_TYPE_HOSTNAME, - CLUSTERMSG_EXT_TYPE_HUMAN_NODENAME, - CLUSTERMSG_EXT_TYPE_FORGOTTEN_NODE, - CLUSTERMSG_EXT_TYPE_SHARDID, -} clusterMsgPingtypes; - -/* Helper function for making sure extensions are eight byte aligned. */ -#define EIGHT_BYTE_ALIGN(size) ((((size) + 7) / 8) * 8) - -typedef struct { - char hostname[1]; /* The announced hostname, ends with \0. */ -} clusterMsgPingExtHostname; - -typedef struct { - char human_nodename[1]; /* The announced nodename, ends with \0. */ -} clusterMsgPingExtHumanNodename; - -typedef struct { - char name[CLUSTER_NAMELEN]; /* Node name. */ - uint64_t ttl; /* Remaining time to blacklist the node, in seconds. */ -} clusterMsgPingExtForgottenNode; - -static_assert(sizeof(clusterMsgPingExtForgottenNode) % 8 == 0, ""); - -typedef struct { - char shard_id[CLUSTER_NAMELEN]; /* The shard_id, 40 bytes fixed. */ -} clusterMsgPingExtShardId; - -typedef struct { - uint32_t length; /* Total length of this extension message (including this header) */ - uint16_t type; /* Type of this extension message (see clusterMsgPingExtTypes) */ - uint16_t unused; /* 16 bits of padding to make this structure 8 byte aligned. 
*/ - union { - clusterMsgPingExtHostname hostname; - clusterMsgPingExtHumanNodename human_nodename; - clusterMsgPingExtForgottenNode forgotten_node; - clusterMsgPingExtShardId shard_id; - } ext[]; /* Actual extension information, formatted so that the data is 8 - * byte aligned, regardless of its content. */ -} clusterMsgPingExt; - -union clusterMsgData { - /* PING, MEET and PONG */ - struct { - /* Array of N clusterMsgDataGossip structures */ - clusterMsgDataGossip gossip[1]; - /* Extension data that can optionally be sent for ping/meet/pong - * messages. We can't explicitly define them here though, since - * the gossip array isn't the real length of the gossip data. */ - } ping; - - /* FAIL */ - struct { - clusterMsgDataFail about; - } fail; - - /* PUBLISH */ - struct { - clusterMsgDataPublish msg; - } publish; - - /* UPDATE */ - struct { - clusterMsgDataUpdate nodecfg; - } update; - - /* MODULE */ - struct { - clusterMsgModule msg; - } module; -}; - -#define CLUSTER_PROTO_VER 1 /* Cluster bus protocol version. */ - -typedef struct { - char sig[4]; /* Signature "RCmb" (Redis Cluster message bus). */ - uint32_t totlen; /* Total length of this message */ - uint16_t ver; /* Protocol version, currently set to 1. */ - uint16_t port; /* Primary port number (TCP or TLS). */ - uint16_t type; /* Message type */ - uint16_t count; /* Only used for some kind of messages. */ - uint64_t currentEpoch; /* The epoch accordingly to the sending node. */ - uint64_t configEpoch; /* The config epoch if it's a master, or the last - epoch advertised by its master if it is a - slave. */ - uint64_t offset; /* Master replication offset if node is a master or - processed replication offset if node is a slave. */ - char sender[CLUSTER_NAMELEN]; /* Name of the sender node */ - unsigned char myslots[CLUSTER_SLOTS/8]; - char slaveof[CLUSTER_NAMELEN]; - char myip[NET_IP_STR_LEN]; /* Sender IP, if not all zeroed. */ - uint16_t extensions; /* Number of extensions sent along with this packet. 
*/ - char notused1[30]; /* 30 bytes reserved for future usage. */ - uint16_t pport; /* Secondary port number: if primary port is TCP port, this is - TLS port, and if primary port is TLS port, this is TCP port.*/ - uint16_t cport; /* Sender TCP cluster bus port */ - uint16_t flags; /* Sender node flags */ - unsigned char state; /* Cluster state from the POV of the sender */ - unsigned char mflags[3]; /* Message flags: CLUSTERMSG_FLAG[012]_... */ - union clusterMsgData data; -} clusterMsg; - -/* clusterMsg defines the gossip wire protocol exchanged among Redis cluster - * members, which can be running different versions of redis-server bits, - * especially during cluster rolling upgrades. - * - * Therefore, fields in this struct should remain at the same offset from - * release to release. The static asserts below ensures that incompatible - * changes in clusterMsg be caught at compile time. - */ - -static_assert(offsetof(clusterMsg, sig) == 0, "unexpected field offset"); -static_assert(offsetof(clusterMsg, totlen) == 4, "unexpected field offset"); -static_assert(offsetof(clusterMsg, ver) == 8, "unexpected field offset"); -static_assert(offsetof(clusterMsg, port) == 10, "unexpected field offset"); -static_assert(offsetof(clusterMsg, type) == 12, "unexpected field offset"); -static_assert(offsetof(clusterMsg, count) == 14, "unexpected field offset"); -static_assert(offsetof(clusterMsg, currentEpoch) == 16, "unexpected field offset"); -static_assert(offsetof(clusterMsg, configEpoch) == 24, "unexpected field offset"); -static_assert(offsetof(clusterMsg, offset) == 32, "unexpected field offset"); -static_assert(offsetof(clusterMsg, sender) == 40, "unexpected field offset"); -static_assert(offsetof(clusterMsg, myslots) == 80, "unexpected field offset"); -static_assert(offsetof(clusterMsg, slaveof) == 2128, "unexpected field offset"); -static_assert(offsetof(clusterMsg, myip) == 2168, "unexpected field offset"); -static_assert(offsetof(clusterMsg, extensions) == 2214, 
"unexpected field offset"); -static_assert(offsetof(clusterMsg, notused1) == 2216, "unexpected field offset"); -static_assert(offsetof(clusterMsg, pport) == 2246, "unexpected field offset"); -static_assert(offsetof(clusterMsg, cport) == 2248, "unexpected field offset"); -static_assert(offsetof(clusterMsg, flags) == 2250, "unexpected field offset"); -static_assert(offsetof(clusterMsg, state) == 2252, "unexpected field offset"); -static_assert(offsetof(clusterMsg, mflags) == 2253, "unexpected field offset"); -static_assert(offsetof(clusterMsg, data) == 2256, "unexpected field offset"); - -#define CLUSTERMSG_MIN_LEN (sizeof(clusterMsg)-sizeof(union clusterMsgData)) - -/* Message flags better specify the packet content or are used to - * provide some information about the node state. */ -#define CLUSTERMSG_FLAG0_PAUSED (1<<0) /* Master paused for manual failover. */ -#define CLUSTERMSG_FLAG0_FORCEACK (1<<1) /* Give ACK to AUTH_REQUEST even if - master is up. */ -#define CLUSTERMSG_FLAG0_EXT_DATA (1<<2) /* Message contains extension data */ - /* ---------------------- API exported outside cluster.c -------------------- */ +/* functions requiring mechanism specific implementations */ void clusterInit(void); -void clusterInitListeners(void); +void clusterInitLast(void); void clusterCron(void); void clusterBeforeSleep(void); -clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask); -int verifyClusterNodeId(const char *name, int length); +int verifyClusterConfigWithData(void); + +int clusterSendModuleMessageToTarget(const char *target, uint64_t module_id, uint8_t type, const char *payload, uint32_t len); + +void clusterUpdateMyselfFlags(void); +void clusterUpdateMyselfIp(void); +void clusterUpdateMyselfHostname(void); +void clusterUpdateMyselfAnnouncedPorts(void); +void clusterUpdateMyselfHumanNodename(void); + +void slotToChannelAdd(sds channel); +void slotToChannelDel(sds channel); +void clusterPropagatePublish(robj 
*channel, robj *message, int sharded); + +unsigned long getClusterConnectionsCount(void); +int isClusterHealthy(void); + +sds clusterGenNodesDescription(client *c, int filter, int tls_primary); +sds genClusterInfoString(void); +/* handle implementation specific debug cluster commands. Return 1 if handled, 0 otherwise. */ +int handleDebugClusterCommand(client *c); +const char **clusterDebugCommandExtendedHelp(void); +/* handle implementation specific cluster commands. Return 1 if handled, 0 otherwise. */ +int clusterCommandSpecial(client *c); +const char** clusterCommandExtendedHelp(void); + +int clusterAllowFailoverCmd(client *c); +void clusterPromoteSelfToMaster(void); +int clusterManualFailoverTimeLimit(void); + +void clusterCommandSlots(client * c); +void clusterCommandMyId(client *c); +void clusterCommandMyShardId(client *c); +void clusterCommandShards(client *c); +sds clusterGenNodeDescription(client *c, clusterNode *node, int tls_primary); + +int clusterNodeCoversSlot(clusterNode *n, int slot); +int getNodeDefaultClientPort(clusterNode *n); +int clusterNodeIsMyself(clusterNode *n); +clusterNode *getMyClusterNode(void); +char *getMyClusterId(void); +int getClusterSize(void); +int handleDebugClusterCommand(client *c); +int clusterNodePending(clusterNode *node); +int clusterNodeIsMaster(clusterNode *n); +char **getClusterNodesList(size_t *numnodes); +int clusterNodeIsMaster(clusterNode *n); +char *clusterNodeIp(clusterNode *node); +int clusterNodeIsSlave(clusterNode *node); +clusterNode *clusterNodeGetSlaveof(clusterNode *node); +char *clusterNodeGetName(clusterNode *node); +int clusterNodeTimedOut(clusterNode *node); +int clusterNodeIsFailing(clusterNode *node); +int clusterNodeIsNoFailover(clusterNode *node); +char *clusterNodeGetShardId(clusterNode *node); +int clusterNodeNumSlaves(clusterNode *node); +clusterNode *clusterNodeGetSlave(clusterNode *node, int slave_idx); +clusterNode *getMigratingSlotDest(int slot); +clusterNode *getImportingSlotSource(int 
slot); +clusterNode *getNodeBySlot(int slot); +int clusterNodeClientPort(clusterNode *n, int use_tls); +char *clusterNodeHostname(clusterNode *node); +const char *clusterNodePreferredEndpoint(clusterNode *n); +long long clusterNodeReplOffset(clusterNode *node); clusterNode *clusterLookupNode(const char *name, int length); + +/* functions with shared implementations */ +clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask); int clusterRedirectBlockedClientIfNeeded(client *c); void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code); void migrateCloseTimedoutSockets(void); -int verifyClusterConfigWithData(void); -unsigned long getClusterConnectionsCount(void); -int clusterSendModuleMessageToTarget(const char *target, uint64_t module_id, uint8_t type, const char *payload, uint32_t len); -void clusterPropagatePublish(robj *channel, robj *message, int sharded); unsigned int keyHashSlot(char *key, int keylen); int patternHashSlot(char *pattern, int length); -void clusterUpdateMyselfFlags(void); -void clusterUpdateMyselfIp(void); -void slotToChannelAdd(sds channel); -void slotToChannelDel(sds channel); -void clusterUpdateMyselfHostname(void); -void clusterUpdateMyselfAnnouncedPorts(void); -sds clusterGenNodesDescription(client *c, int filter, int tls_primary); -sds genClusterInfoString(void); -void freeClusterLink(clusterLink *link); -int clusterNodeGetSlotBit(clusterNode *n, int slot); -void clusterUpdateMyselfHumanNodename(void); int isValidAuxString(char *s, unsigned int length); -int getNodeDefaultClientPort(clusterNode *n); - +void migrateCommand(client *c); +void clusterCommand(client *c); +ConnectionType *connTypeOfCluster(void); #endif /* __CLUSTER_H */ diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c new file mode 100644 index 000000000..1f957c99d --- /dev/null +++ b/src/cluster_legacy.c @@ -0,0 +1,6426 @@ +/* Redis Cluster implementation. 
+ * + * Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "server.h" +#include "cluster.h" +#include "cluster_legacy.h" +#include "endianconv.h" +#include "connection.h" + +#include <sys/types.h> +#include <sys/socket.h> +#include <arpa/inet.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/stat.h> +#include <sys/file.h> +#include <math.h> + +/* A global reference to myself is handy to make code more clear. + * Myself always points to server.cluster->myself, that is, the clusterNode + * that represents this node. 
*/ +clusterNode *myself = NULL; + +clusterNode *createClusterNode(char *nodename, int flags); +void clusterAddNode(clusterNode *node); +void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask); +void clusterReadHandler(connection *conn); +void clusterSendPing(clusterLink *link, int type); +void clusterSendFail(char *nodename); +void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request); +void clusterUpdateState(void); +int clusterNodeCoversSlot(clusterNode *n, int slot); +list *clusterGetNodesInMyShard(clusterNode *node); +int clusterNodeAddSlave(clusterNode *master, clusterNode *slave); +int clusterAddSlot(clusterNode *n, int slot); +int clusterDelSlot(int slot); +int clusterDelNodeSlots(clusterNode *node); +int clusterNodeSetSlotBit(clusterNode *n, int slot); +void clusterSetMaster(clusterNode *n); +void clusterHandleSlaveFailover(void); +void clusterHandleSlaveMigration(int max_slaves); +int bitmapTestBit(unsigned char *bitmap, int pos); +void bitmapSetBit(unsigned char *bitmap, int pos); +void bitmapClearBit(unsigned char *bitmap, int pos); +void clusterDoBeforeSleep(int flags); +void clusterSendUpdate(clusterLink *link, clusterNode *node); +void resetManualFailover(void); +void clusterCloseAllSlots(void); +void clusterSetNodeAsMaster(clusterNode *n); +void clusterDelNode(clusterNode *delnode); +sds representClusterNodeFlags(sds ci, uint16_t flags); +sds representSlotInfo(sds ci, uint16_t *slot_info_pairs, int slot_info_pairs_count); +void clusterFreeNodesSlotsInfo(clusterNode *n); +uint64_t clusterGetMaxEpoch(void); +int clusterBumpConfigEpochWithoutConsensus(void); +void moduleCallClusterReceivers(const char *sender_id, uint64_t module_id, uint8_t type, const unsigned char *payload, uint32_t len); +const char *clusterGetMessageTypeString(int type); +void removeChannelsInSlot(unsigned int slot); +unsigned int countKeysInSlot(unsigned int hashslot); +unsigned int countChannelsInSlot(unsigned int hashslot); +unsigned int 
delKeysInSlot(unsigned int hashslot); +void clusterAddNodeToShard(const char *shard_id, clusterNode *node); +list *clusterLookupNodeListByShardId(const char *shard_id); +void clusterRemoveNodeFromShard(clusterNode *node); +int auxShardIdSetter(clusterNode *n, void *value, int length); +sds auxShardIdGetter(clusterNode *n, sds s); +int auxShardIdPresent(clusterNode *n); +int auxHumanNodenameSetter(clusterNode *n, void *value, int length); +sds auxHumanNodenameGetter(clusterNode *n, sds s); +int auxHumanNodenamePresent(clusterNode *n); +int auxTcpPortSetter(clusterNode *n, void *value, int length); +sds auxTcpPortGetter(clusterNode *n, sds s); +int auxTcpPortPresent(clusterNode *n); +int auxTlsPortSetter(clusterNode *n, void *value, int length); +sds auxTlsPortGetter(clusterNode *n, sds s); +int auxTlsPortPresent(clusterNode *n); +static void clusterBuildMessageHdr(clusterMsg *hdr, int type, size_t msglen); +void freeClusterLink(clusterLink *link); +int verifyClusterNodeId(const char *name, int length); + +int getNodeDefaultClientPort(clusterNode *n) { + return server.tls_cluster ? n->tls_port : n->tcp_port; +} + +static inline int getNodeDefaultReplicationPort(clusterNode *n) { + return server.tls_replication ? n->tls_port : n->tcp_port; +} + +int clusterNodeClientPort(clusterNode *n, int use_tls) { + return use_tls ? n->tls_port : n->tcp_port; +} + +static inline int defaultClientPort(void) { + return server.tls_cluster ? server.tls_port : server.port; +} + +#define isSlotUnclaimed(slot) \ + (server.cluster->slots[slot] == NULL || \ + bitmapTestBit(server.cluster->owner_not_claiming_slot, slot)) + +#define RCVBUF_INIT_LEN 1024 +#define RCVBUF_MAX_PREALLOC (1<<20) /* 1MB */ + +/* Cluster nodes hash table, mapping nodes addresses 1.2.3.4:6379 to + * clusterNode structures. 
*/ +dictType clusterNodesDictType = { + dictSdsHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictSdsKeyCompare, /* key compare */ + dictSdsDestructor, /* key destructor */ + NULL, /* val destructor */ + NULL /* allow to expand */ +}; + +/* Cluster re-addition blacklist. This maps node IDs to the time + * we can re-add this node. The goal is to avoid reading a removed + * node for some time. */ +dictType clusterNodesBlackListDictType = { + dictSdsCaseHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictSdsKeyCaseCompare, /* key compare */ + dictSdsDestructor, /* key destructor */ + NULL, /* val destructor */ + NULL /* allow to expand */ +}; + +/* Cluster shards hash table, mapping shard id to list of nodes */ +dictType clusterSdsToListType = { + dictSdsHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictSdsKeyCompare, /* key compare */ + dictSdsDestructor, /* key destructor */ + dictListDestructor, /* val destructor */ + NULL /* allow to expand */ +}; + +/* Aux fields are introduced in Redis 7.2 to support the persistence + * of various important node properties, such as shard id, in nodes.conf. + * Aux fields take an explicit format of name=value pairs and have no + * intrinsic order among them. Aux fields are always grouped together + * at the end of the second column of each row after the node's IP + * address/port/cluster_port and the optional hostname. Aux fields + * are separated by ','. 
*/ + +/* Aux field setter function prototype + * return C_OK when the update is successful; C_ERR otherwise */ +typedef int (aux_value_setter) (clusterNode* n, void *value, int length); +/* Aux field getter function prototype + * return an sds that is a concatenation of the input sds string and + * the aux value */ +typedef sds (aux_value_getter) (clusterNode* n, sds s); + +typedef int (aux_value_present) (clusterNode* n); + +typedef struct { + char *field; + aux_value_setter *setter; + aux_value_getter *getter; + aux_value_present *isPresent; +} auxFieldHandler; + +/* Assign index to each aux field */ +typedef enum { + af_shard_id, + af_human_nodename, + af_tcp_port, + af_tls_port, + af_count, +} auxFieldIndex; + +/* Note that + * 1. the order of the elements below must match that of their + * indices as defined in auxFieldIndex + * 2. aux name can contain characters that pass the isValidAuxChar check only */ +auxFieldHandler auxFieldHandlers[] = { + {"shard-id", auxShardIdSetter, auxShardIdGetter, auxShardIdPresent}, + {"nodename", auxHumanNodenameSetter, auxHumanNodenameGetter, auxHumanNodenamePresent}, + {"tcp-port", auxTcpPortSetter, auxTcpPortGetter, auxTcpPortPresent}, + {"tls-port", auxTlsPortSetter, auxTlsPortGetter, auxTlsPortPresent}, +}; + +int auxShardIdSetter(clusterNode *n, void *value, int length) { + if (verifyClusterNodeId(value, length) == C_ERR) { + return C_ERR; + } + memcpy(n->shard_id, value, CLUSTER_NAMELEN); + /* if n already has replicas, make sure they all agree + * on the shard id */ + for (int i = 0; i < n->numslaves; i++) { + if (memcmp(n->slaves[i]->shard_id, n->shard_id, CLUSTER_NAMELEN) != 0) { + return C_ERR; + } + } + clusterAddNodeToShard(value, n); + return C_OK; +} + +sds auxShardIdGetter(clusterNode *n, sds s) { + return sdscatprintf(s, "%.40s", n->shard_id); +} + +int auxShardIdPresent(clusterNode *n) { + return strlen(n->shard_id); +} + +int auxHumanNodenameSetter(clusterNode *n, void *value, int length) { + if (n && 
!strncmp(value, n->human_nodename, length)) { + return C_OK; + } else if (!n && (length == 0)) { + return C_OK; + } + if (n) { + n->human_nodename = sdscpylen(n->human_nodename, value, length); + } else if (sdslen(n->human_nodename) != 0) { + sdsclear(n->human_nodename); + } else { + return C_ERR; + } + return C_OK; +} + +sds auxHumanNodenameGetter(clusterNode *n, sds s) { + return sdscatprintf(s, "%s", n->human_nodename); +} + +int auxHumanNodenamePresent(clusterNode *n) { + return sdslen(n->human_nodename); +} + +int auxTcpPortSetter(clusterNode *n, void *value, int length) { + if (length > 5 || length < 1) { + return C_ERR; + } + char buf[length + 1]; + memcpy(buf, (char*)value, length); + buf[length] = '\0'; + n->tcp_port = atoi(buf); + return (n->tcp_port < 0 || n->tcp_port >= 65536) ? C_ERR : C_OK; +} + +sds auxTcpPortGetter(clusterNode *n, sds s) { + return sdscatprintf(s, "%d", n->tcp_port); +} + +int auxTcpPortPresent(clusterNode *n) { + return n->tcp_port >= 0 && n->tcp_port < 65536; +} + +int auxTlsPortSetter(clusterNode *n, void *value, int length) { + if (length > 5 || length < 1) { + return C_ERR; + } + char buf[length + 1]; + memcpy(buf, (char*)value, length); + buf[length] = '\0'; + n->tls_port = atoi(buf); + return (n->tls_port < 0 || n->tls_port >= 65536) ? C_ERR : C_OK; +} + +sds auxTlsPortGetter(clusterNode *n, sds s) { + return sdscatprintf(s, "%d", n->tls_port); +} + +int auxTlsPortPresent(clusterNode *n) { + return n->tls_port >= 0 && n->tls_port < 65536; +} + +/* clusterLink send queue blocks */ +typedef struct { + size_t totlen; /* Total length of this block including the message */ + int refcount; /* Number of cluster link send msg queues containing the message */ + clusterMsg msg; +} clusterMsgSendBlock; + +/* ----------------------------------------------------------------------------- + * Initialization + * -------------------------------------------------------------------------- */ + +/* Load the cluster config from 'filename'. 
+ * + * If the file does not exist or is zero-length (this may happen because + * when we lock the nodes.conf file, we create a zero-length one for the + * sake of locking if it does not already exist), C_ERR is returned. + * If the configuration was loaded from the file, C_OK is returned. */ +int clusterLoadConfig(char *filename) { + FILE *fp = fopen(filename,"r"); + struct stat sb; + char *line; + int maxline, j; + + if (fp == NULL) { + if (errno == ENOENT) { + return C_ERR; + } else { + serverLog(LL_WARNING, + "Loading the cluster node config from %s: %s", + filename, strerror(errno)); + exit(1); + } + } + + if (redis_fstat(fileno(fp),&sb) == -1) { + serverLog(LL_WARNING, + "Unable to obtain the cluster node config file stat %s: %s", + filename, strerror(errno)); + exit(1); + } + /* Check if the file is zero-length: if so return C_ERR to signal + * we have to write the config. */ + if (sb.st_size == 0) { + fclose(fp); + return C_ERR; + } + + /* Parse the file. Note that single lines of the cluster config file can + * be really long as they include all the hash slots of the node. + * This means in the worst possible case, half of the Redis slots will be + * present in a single line, possibly in importing or migrating state, so + * together with the node ID of the sender/receiver. + * + * To simplify we allocate 1024+CLUSTER_SLOTS*128 bytes per line. */ + maxline = 1024+CLUSTER_SLOTS*128; + line = zmalloc(maxline); + while(fgets(line,maxline,fp) != NULL) { + int argc, aux_argc; + sds *argv, *aux_argv; + clusterNode *n, *master; + char *p, *s; + + /* Skip blank lines, they can be created either by users manually + * editing nodes.conf or by the config writing process if stopped + * before the truncate() call. */ + if (line[0] == '\n' || line[0] == '\0') continue; + + /* Split the line into arguments for processing. */ + argv = sdssplitargs(line,&argc); + if (argv == NULL) goto fmterr; + + /* Handle the special "vars" line. 
Don't pretend it is the last + * line even if it actually is when generated by Redis. */ + if (strcasecmp(argv[0],"vars") == 0) { + if (!(argc % 2)) goto fmterr; + for (j = 1; j < argc; j += 2) { + if (strcasecmp(argv[j],"currentEpoch") == 0) { + server.cluster->currentEpoch = + strtoull(argv[j+1],NULL,10); + } else if (strcasecmp(argv[j],"lastVoteEpoch") == 0) { + server.cluster->lastVoteEpoch = + strtoull(argv[j+1],NULL,10); + } else { + serverLog(LL_NOTICE, + "Skipping unknown cluster config variable '%s'", + argv[j]); + } + } + sdsfreesplitres(argv,argc); + continue; + } + + /* Regular config lines have at least eight fields */ + if (argc < 8) { + sdsfreesplitres(argv,argc); + goto fmterr; + } + + /* Create this node if it does not exist */ + if (verifyClusterNodeId(argv[0], sdslen(argv[0])) == C_ERR) { + sdsfreesplitres(argv, argc); + goto fmterr; + } + n = clusterLookupNode(argv[0], sdslen(argv[0])); + if (!n) { + n = createClusterNode(argv[0],0); + clusterAddNode(n); + } + /* Format for the node address and auxiliary argument information: + * ip:port[@cport][,hostname][,aux=val]*] */ + + aux_argv = sdssplitlen(argv[1], sdslen(argv[1]), ",", 1, &aux_argc); + if (aux_argv == NULL) { + sdsfreesplitres(argv,argc); + goto fmterr; + } + + /* Hostname is an optional argument that defines the endpoint + * that can be reported to clients instead of IP. */ + if (aux_argc > 1 && sdslen(aux_argv[1]) > 0) { + n->hostname = sdscpy(n->hostname, aux_argv[1]); + } else if (sdslen(n->hostname) != 0) { + sdsclear(n->hostname); + } + + /* All fields after hostname are auxiliary and they take on + * the format of "aux=val" where both aux and val can contain + * characters that pass the isValidAuxChar check only. The order + * of the aux fields is insignificant. 
*/ + int aux_tcp_port = 0; + int aux_tls_port = 0; + for (int i = 2; i < aux_argc; i++) { + int field_argc; + sds *field_argv; + field_argv = sdssplitlen(aux_argv[i], sdslen(aux_argv[i]), "=", 1, &field_argc); + if (field_argv == NULL || field_argc != 2) { + /* Invalid aux field format */ + if (field_argv != NULL) sdsfreesplitres(field_argv, field_argc); + sdsfreesplitres(argv,argc); + goto fmterr; + } + + /* Validate that both aux and value contain valid characters only */ + for (unsigned j = 0; j < 2; j++) { + if (!isValidAuxString(field_argv[j],sdslen(field_argv[j]))){ + /* Invalid aux field format */ + sdsfreesplitres(field_argv, field_argc); + sdsfreesplitres(argv,argc); + goto fmterr; + } + } + + /* Note that we don't expect lots of aux fields in the foreseeable + * future so a linear search is completely fine. */ + int field_found = 0; + for (unsigned j = 0; j < numElements(auxFieldHandlers); j++) { + if (sdslen(field_argv[0]) != strlen(auxFieldHandlers[j].field) || + memcmp(field_argv[0], auxFieldHandlers[j].field, sdslen(field_argv[0])) != 0) { + continue; + } + field_found = 1; + aux_tcp_port |= j == af_tcp_port; + aux_tls_port |= j == af_tls_port; + if (auxFieldHandlers[j].setter(n, field_argv[1], sdslen(field_argv[1])) != C_OK) { + /* Invalid aux field format */ + sdsfreesplitres(field_argv, field_argc); + sdsfreesplitres(argv,argc); + goto fmterr; + } + } + + if (field_found == 0) { + /* Invalid aux field format */ + sdsfreesplitres(field_argv, field_argc); + sdsfreesplitres(argv,argc); + goto fmterr; + } + + sdsfreesplitres(field_argv, field_argc); + } + /* Address and port */ + if ((p = strrchr(aux_argv[0],':')) == NULL) { + sdsfreesplitres(aux_argv, aux_argc); + sdsfreesplitres(argv,argc); + goto fmterr; + } + *p = '\0'; + memcpy(n->ip,aux_argv[0],strlen(aux_argv[0])+1); + char *port = p+1; + char *busp = strchr(port,'@'); + if (busp) { + *busp = '\0'; + busp++; + } + /* If neither TCP or TLS port is found in aux field, it is considered + * an old 
version of nodes.conf file.*/ + if (!aux_tcp_port && !aux_tls_port) { + if (server.tls_cluster) { + n->tls_port = atoi(port); + } else { + n->tcp_port = atoi(port); + } + } else if (!aux_tcp_port) { + n->tcp_port = atoi(port); + } else if (!aux_tls_port) { + n->tls_port = atoi(port); + } + /* In older versions of nodes.conf the "@busport" part is missing. + * In this case we set it to the default offset of 10000 from the + * base port. */ + n->cport = busp ? atoi(busp) : (getNodeDefaultClientPort(n) + CLUSTER_PORT_INCR); + + /* The plaintext port for client in a TLS cluster (n->pport) is not + * stored in nodes.conf. It is received later over the bus protocol. */ + + sdsfreesplitres(aux_argv, aux_argc); + + /* Parse flags */ + p = s = argv[2]; + while(p) { + p = strchr(s,','); + if (p) *p = '\0'; + if (!strcasecmp(s,"myself")) { + serverAssert(server.cluster->myself == NULL); + myself = server.cluster->myself = n; + n->flags |= CLUSTER_NODE_MYSELF; + } else if (!strcasecmp(s,"master")) { + n->flags |= CLUSTER_NODE_MASTER; + } else if (!strcasecmp(s,"slave")) { + n->flags |= CLUSTER_NODE_SLAVE; + } else if (!strcasecmp(s,"fail?")) { + n->flags |= CLUSTER_NODE_PFAIL; + } else if (!strcasecmp(s,"fail")) { + n->flags |= CLUSTER_NODE_FAIL; + n->fail_time = mstime(); + } else if (!strcasecmp(s,"handshake")) { + n->flags |= CLUSTER_NODE_HANDSHAKE; + } else if (!strcasecmp(s,"noaddr")) { + n->flags |= CLUSTER_NODE_NOADDR; + } else if (!strcasecmp(s,"nofailover")) { + n->flags |= CLUSTER_NODE_NOFAILOVER; + } else if (!strcasecmp(s,"noflags")) { + /* nothing to do */ + } else { + serverPanic("Unknown flag in redis cluster config file"); + } + if (p) s = p+1; + } + + /* Get master if any. Set the master and populate master's + * slave list. 
*/ + if (argv[3][0] != '-') { + if (verifyClusterNodeId(argv[3], sdslen(argv[3])) == C_ERR) { + sdsfreesplitres(argv, argc); + goto fmterr; + } + master = clusterLookupNode(argv[3], sdslen(argv[3])); + if (!master) { + master = createClusterNode(argv[3],0); + clusterAddNode(master); + } + /* shard_id can be absent if we are loading a nodes.conf generated + * by an older version of Redis; we should follow the primary's + * shard_id in this case */ + if (auxFieldHandlers[af_shard_id].isPresent(n) == 0) { + memcpy(n->shard_id, master->shard_id, CLUSTER_NAMELEN); + clusterAddNodeToShard(master->shard_id, n); + } else if (clusterGetNodesInMyShard(master) != NULL && + memcmp(master->shard_id, n->shard_id, CLUSTER_NAMELEN) != 0) + { + /* If the primary has been added to a shard, make sure this + * node has the same persisted shard id as the primary. */ + goto fmterr; + } + n->slaveof = master; + clusterNodeAddSlave(master,n); + } else if (auxFieldHandlers[af_shard_id].isPresent(n) == 0) { + /* n is a primary but it does not have a persisted shard_id. + * This happens if we are loading a nodes.conf generated by + * an older version of Redis. We should manually update the + * shard membership in this case */ + clusterAddNodeToShard(n->shard_id, n); + } + + /* Set ping sent / pong received timestamps */ + if (atoi(argv[4])) n->ping_sent = mstime(); + if (atoi(argv[5])) n->pong_received = mstime(); + + /* Set configEpoch for this node. + * If the node is a replica, set its config epoch to 0. + * If it's a primary, load the config epoch from the configuration file. */ + n->configEpoch = (nodeIsSlave(n) && n->slaveof) ? 0 : strtoull(argv[6],NULL,10); + + /* Populate hash slots served by this instance. 
*/ + for (j = 8; j < argc; j++) { + int start, stop; + + if (argv[j][0] == '[') { + /* Here we handle migrating / importing slots */ + int slot; + char direction; + clusterNode *cn; + + p = strchr(argv[j],'-'); + serverAssert(p != NULL); + *p = '\0'; + direction = p[1]; /* Either '>' or '<' */ + slot = atoi(argv[j]+1); + if (slot < 0 || slot >= CLUSTER_SLOTS) { + sdsfreesplitres(argv,argc); + goto fmterr; + } + p += 3; + + char *pr = strchr(p, ']'); + size_t node_len = pr - p; + if (pr == NULL || verifyClusterNodeId(p, node_len) == C_ERR) { + sdsfreesplitres(argv, argc); + goto fmterr; + } + cn = clusterLookupNode(p, CLUSTER_NAMELEN); + if (!cn) { + cn = createClusterNode(p,0); + clusterAddNode(cn); + } + if (direction == '>') { + server.cluster->migrating_slots_to[slot] = cn; + } else { + server.cluster->importing_slots_from[slot] = cn; + } + continue; + } else if ((p = strchr(argv[j],'-')) != NULL) { + *p = '\0'; + start = atoi(argv[j]); + stop = atoi(p+1); + } else { + start = stop = atoi(argv[j]); + } + if (start < 0 || start >= CLUSTER_SLOTS || + stop < 0 || stop >= CLUSTER_SLOTS) + { + sdsfreesplitres(argv,argc); + goto fmterr; + } + while(start <= stop) clusterAddSlot(n, start++); + } + + sdsfreesplitres(argv,argc); + } + /* Config sanity check */ + if (server.cluster->myself == NULL) goto fmterr; + + zfree(line); + fclose(fp); + + serverLog(LL_NOTICE,"Node configuration loaded, I'm %.40s", myself->name); + + /* Something that should never happen: currentEpoch smaller than + * the max epoch found in the nodes configuration. However we handle this + * as some form of protection against manual editing of critical files. 
*/ + if (clusterGetMaxEpoch() > server.cluster->currentEpoch) { + server.cluster->currentEpoch = clusterGetMaxEpoch(); + } + return C_OK; + +fmterr: + serverLog(LL_WARNING, + "Unrecoverable error: corrupted cluster config file \"%s\".", line); + zfree(line); + if (fp) fclose(fp); + exit(1); +} + +/* Cluster node configuration is exactly the same as CLUSTER NODES output. + * + * This function writes the node config and returns 0, on error -1 + * is returned. + * + * Note: we need to write the file in an atomic way from the point of view + * of the POSIX filesystem semantics, so that if the server is stopped + * or crashes during the write, we'll end with either the old file or the + * new one. Since we have the full payload to write available we can use + * a single write to write the whole file. If the pre-existing file was + * bigger we pad our payload with newlines that are anyway ignored and truncate + * the file afterward. */ +int clusterSaveConfig(int do_fsync) { + sds ci,tmpfilename; + size_t content_size,offset = 0; + ssize_t written_bytes; + int fd = -1; + int retval = C_ERR; + + server.cluster->todo_before_sleep &= ~CLUSTER_TODO_SAVE_CONFIG; + + /* Get the nodes description and concatenate our "vars" directive to + * save currentEpoch and lastVoteEpoch. */ + ci = clusterGenNodesDescription(NULL, CLUSTER_NODE_HANDSHAKE, 0); + ci = sdscatprintf(ci,"vars currentEpoch %llu lastVoteEpoch %llu\n", + (unsigned long long) server.cluster->currentEpoch, + (unsigned long long) server.cluster->lastVoteEpoch); + content_size = sdslen(ci); + + /* Create a temp file with the new content. 
*/ + tmpfilename = sdscatfmt(sdsempty(),"%s.tmp-%i-%I", + server.cluster_configfile,(int) getpid(),mstime()); + if ((fd = open(tmpfilename,O_WRONLY|O_CREAT,0644)) == -1) { + serverLog(LL_WARNING,"Could not open temp cluster config file: %s",strerror(errno)); + goto cleanup; + } + + while (offset < content_size) { + written_bytes = write(fd,ci + offset,content_size - offset); + if (written_bytes <= 0) { + if (errno == EINTR) continue; + serverLog(LL_WARNING,"Failed after writing (%zd) bytes to tmp cluster config file: %s", + offset,strerror(errno)); + goto cleanup; + } + offset += written_bytes; + } + + if (do_fsync) { + server.cluster->todo_before_sleep &= ~CLUSTER_TODO_FSYNC_CONFIG; + if (redis_fsync(fd) == -1) { + serverLog(LL_WARNING,"Could not sync tmp cluster config file: %s",strerror(errno)); + goto cleanup; + } + } + + if (rename(tmpfilename, server.cluster_configfile) == -1) { + serverLog(LL_WARNING,"Could not rename tmp cluster config file: %s",strerror(errno)); + goto cleanup; + } + + if (do_fsync) { + if (fsyncFileDir(server.cluster_configfile) == -1) { + serverLog(LL_WARNING,"Could not sync cluster config file dir: %s",strerror(errno)); + goto cleanup; + } + } + retval = C_OK; /* If we reached this point, everything is fine. */ + +cleanup: + if (fd != -1) close(fd); + if (retval) unlink(tmpfilename); + sdsfree(tmpfilename); + sdsfree(ci); + return retval; +} + +void clusterSaveConfigOrDie(int do_fsync) { + if (clusterSaveConfig(do_fsync) == -1) { + serverLog(LL_WARNING,"Fatal: can't update cluster config file."); + exit(1); + } +} + +/* Lock the cluster config using flock(), and retain the file descriptor used to + * acquire the lock so that the file will be locked as long as the process is up. + * + * This works because we always update nodes.conf with a new version + * in-place, reopening the file, and writing to it in place (later adjusting + * the length with ftruncate()). 
+ * + * On success C_OK is returned, otherwise an error is logged and + * the function returns C_ERR to signal a lock was not acquired. */ +int clusterLockConfig(char *filename) { +/* flock() does not exist on Solaris + * and a fcntl-based solution won't help, as we constantly re-open that file, + * which will release _all_ locks anyway + */ +#if !defined(__sun) + /* To lock it, we need to open the file in a way it is created if + * it does not exist, otherwise there is a race condition with other + * processes. */ + int fd = open(filename,O_WRONLY|O_CREAT|O_CLOEXEC,0644); + if (fd == -1) { + serverLog(LL_WARNING, + "Can't open %s in order to acquire a lock: %s", + filename, strerror(errno)); + return C_ERR; + } + + if (flock(fd,LOCK_EX|LOCK_NB) == -1) { + if (errno == EWOULDBLOCK) { + serverLog(LL_WARNING, + "Sorry, the cluster configuration file %s is already used " + "by a different Redis Cluster node. Please make sure that " + "different nodes use different cluster configuration " + "files.", filename); + } else { + serverLog(LL_WARNING, + "Impossible to lock %s: %s", filename, strerror(errno)); + } + close(fd); + return C_ERR; + } + /* Lock acquired: leak the 'fd' by not closing it until shutdown time, so that + * we'll retain the lock to the file as long as the process exists. + * + * After fork, the child process will get the fd opened by the parent process, + * we need save `fd` to `cluster_config_file_lock_fd`, so that in redisFork(), + * it will be closed in the child process. + * If it is not closed, when the main process is killed -9, but the child process + * (redis-aof-rewrite) is still alive, the fd(lock) will still be held by the + * child process, and the main process will fail to get lock, means fail to start. */ + server.cluster_config_file_lock_fd = fd; +#else + UNUSED(filename); +#endif /* __sun */ + + return C_OK; +} + +/* Derives our ports to be announced in the cluster bus. 
 */ +void deriveAnnouncedPorts(int *announced_tcp_port, int *announced_tls_port, + int *announced_cport) { + /* Config overriding announced ports. */ + *announced_tcp_port = server.cluster_announce_port ? + server.cluster_announce_port : server.port; + *announced_tls_port = server.cluster_announce_tls_port ? + server.cluster_announce_tls_port : server.tls_port; + /* Derive cluster bus port. */ + if (server.cluster_announce_bus_port) { + *announced_cport = server.cluster_announce_bus_port; + } else if (server.cluster_port) { + *announced_cport = server.cluster_port; + } else { + *announced_cport = defaultClientPort() + CLUSTER_PORT_INCR; + } +} + +/* Some flags (currently just the NOFAILOVER flag) may need to be updated + * in the "myself" node based on the current configuration of the node, + * that may change at runtime via CONFIG SET. This function changes the + * set of flags in myself->flags accordingly. */ +void clusterUpdateMyselfFlags(void) { + if (!myself) return; + int oldflags = myself->flags; + int nofailover = server.cluster_slave_no_failover ? + CLUSTER_NODE_NOFAILOVER : 0; + myself->flags &= ~CLUSTER_NODE_NOFAILOVER; + myself->flags |= nofailover; + if (myself->flags != oldflags) { + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); + } +} + + +/* We want to keep myself->port/cport/pport in sync with the +* cluster-announce-port/cluster-announce-bus-port/cluster-announce-tls-port option. +* The option can be set at runtime via CONFIG SET. */ +void clusterUpdateMyselfAnnouncedPorts(void) { + if (!myself) return; + deriveAnnouncedPorts(&myself->tcp_port,&myself->tls_port,&myself->cport); +} + +/* We want to keep myself->ip in sync with the cluster-announce-ip option. +* The option can be set at runtime via CONFIG SET. 
*/ +void clusterUpdateMyselfIp(void) { + if (!myself) return; + static char *prev_ip = NULL; + char *curr_ip = server.cluster_announce_ip; + int changed = 0; + + if (prev_ip == NULL && curr_ip != NULL) changed = 1; + else if (prev_ip != NULL && curr_ip == NULL) changed = 1; + else if (prev_ip && curr_ip && strcmp(prev_ip,curr_ip)) changed = 1; + + if (changed) { + if (prev_ip) zfree(prev_ip); + prev_ip = curr_ip; + + if (curr_ip) { + /* We always take a copy of the previous IP address, by + * duplicating the string. This way later we can check if + * the address really changed. */ + prev_ip = zstrdup(prev_ip); + redis_strlcpy(myself->ip,server.cluster_announce_ip,NET_IP_STR_LEN); + } else { + myself->ip[0] = '\0'; /* Force autodetection. */ + } + } +} + +/* Update the hostname for the specified node with the provided C string. */ +static void updateAnnouncedHostname(clusterNode *node, char *new) { + /* Previous and new hostname are the same, no need to update. */ + if (new && !strcmp(new, node->hostname)) { + return; + } else if (!new && (sdslen(node->hostname) == 0)) { + return; + } + + if (new) { + node->hostname = sdscpy(node->hostname, new); + } else if (sdslen(node->hostname) != 0) { + sdsclear(node->hostname); + } + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); +} + +static void updateAnnouncedHumanNodename(clusterNode *node, char *new) { + if (new && !strcmp(new, node->human_nodename)) { + return; + } else if (!new && (sdslen(node->human_nodename) == 0)) { + return; + } + + if (new) { + node->human_nodename = sdscpy(node->human_nodename, new); + } else if (sdslen(node->human_nodename) != 0) { + sdsclear(node->human_nodename); + } + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); +} + + +static void updateShardId(clusterNode *node, const char *shard_id) { + if (shard_id && memcmp(node->shard_id, shard_id, CLUSTER_NAMELEN) != 0) { + clusterRemoveNodeFromShard(node); + memcpy(node->shard_id, shard_id, CLUSTER_NAMELEN); + clusterAddNodeToShard(shard_id, node); 
+ clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + } + if (shard_id && myself != node && myself->slaveof == node) { + if (memcmp(myself->shard_id, shard_id, CLUSTER_NAMELEN) != 0) { + /* shard-id can diverge right after a rolling upgrade + * from pre-7.2 releases */ + clusterRemoveNodeFromShard(myself); + memcpy(myself->shard_id, shard_id, CLUSTER_NAMELEN); + clusterAddNodeToShard(shard_id, myself); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_FSYNC_CONFIG); + } + } +} + +/* Update my hostname based on server configuration values */ +void clusterUpdateMyselfHostname(void) { + if (!myself) return; + updateAnnouncedHostname(myself, server.cluster_announce_hostname); +} + +void clusterUpdateMyselfHumanNodename(void) { + if (!myself) return; + updateAnnouncedHumanNodename(myself, server.cluster_announce_human_nodename); +} + +void clusterInit(void) { + int saveconf = 0; + + server.cluster = zmalloc(sizeof(struct clusterState)); + server.cluster->myself = NULL; + server.cluster->currentEpoch = 0; + server.cluster->state = CLUSTER_FAIL; + server.cluster->size = 1; + server.cluster->todo_before_sleep = 0; + server.cluster->nodes = dictCreate(&clusterNodesDictType); + server.cluster->shards = dictCreate(&clusterSdsToListType); + server.cluster->nodes_black_list = + dictCreate(&clusterNodesBlackListDictType); + server.cluster->failover_auth_time = 0; + server.cluster->failover_auth_count = 0; + server.cluster->failover_auth_rank = 0; + server.cluster->failover_auth_epoch = 0; + server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE; + server.cluster->lastVoteEpoch = 0; + + /* Initialize stats */ + for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) { + server.cluster->stats_bus_messages_sent[i] = 0; + server.cluster->stats_bus_messages_received[i] = 0; + } + server.cluster->stats_pfail_nodes = 0; + server.cluster->stat_cluster_links_buffer_limit_exceeded = 0; + + memset(server.cluster->slots,0, sizeof(server.cluster->slots)); + 
clusterCloseAllSlots(); + + memset(server.cluster->owner_not_claiming_slot, 0, sizeof(server.cluster->owner_not_claiming_slot)); + + /* Lock the cluster config file to make sure every node uses + * its own nodes.conf. */ + server.cluster_config_file_lock_fd = -1; + if (clusterLockConfig(server.cluster_configfile) == C_ERR) + exit(1); + + /* Load or create a new nodes configuration. */ + if (clusterLoadConfig(server.cluster_configfile) == C_ERR) { + /* No configuration found. We will just use the random name provided + * by the createClusterNode() function. */ + myself = server.cluster->myself = + createClusterNode(NULL,CLUSTER_NODE_MYSELF|CLUSTER_NODE_MASTER); + serverLog(LL_NOTICE,"No cluster configuration found, I'm %.40s", + myself->name); + clusterAddNode(myself); + clusterAddNodeToShard(myself->shard_id, myself); + saveconf = 1; + } + if (saveconf) clusterSaveConfigOrDie(1); + + /* Port sanity check II + * The other handshake port check is triggered too late to stop + * us from trying to use a too-high cluster port number. */ + int port = defaultClientPort(); + if (!server.cluster_port && port > (65535-CLUSTER_PORT_INCR)) { + serverLog(LL_WARNING, "Redis port number too high. " + "Cluster communication port is 10,000 port " + "numbers higher than your Redis port. " + "Your Redis port number must be 55535 or less."); + exit(1); + } + if (!server.bindaddr_count) { + serverLog(LL_WARNING, "No bind address is configured, but it is required for the Cluster bus."); + exit(1); + } + + /* The slots -> channels map is a radix tree. Initialize it here. */ + server.cluster->slots_to_channels = raxNew(); + + /* Set myself->port/cport/pport to my listening ports, we'll just need to + * discover the IP address via MEET messages. 
*/ + deriveAnnouncedPorts(&myself->tcp_port, &myself->tls_port, &myself->cport); + + server.cluster->mf_end = 0; + server.cluster->mf_slave = NULL; + resetManualFailover(); + clusterUpdateMyselfFlags(); + clusterUpdateMyselfIp(); + clusterUpdateMyselfHostname(); + clusterUpdateMyselfHumanNodename(); +} + +void clusterInitLast(void) { + if (connectionIndexByType(connTypeOfCluster()->get_type(NULL)) < 0) { + serverLog(LL_WARNING, "Missing connection type %s, but it is required for the Cluster bus.", connTypeOfCluster()->get_type(NULL)); + exit(1); + } + + int port = defaultClientPort(); + connListener *listener = &server.clistener; + listener->count = 0; + listener->bindaddr = server.bindaddr; + listener->bindaddr_count = server.bindaddr_count; + listener->port = server.cluster_port ? server.cluster_port : port + CLUSTER_PORT_INCR; + listener->ct = connTypeOfCluster(); + if (connListen(listener) == C_ERR ) { + /* Note: the following log text is matched by the test suite. */ + serverLog(LL_WARNING, "Failed listening on port %u (cluster), aborting.", listener->port); + exit(1); + } + + if (createSocketAcceptHandler(&server.clistener, clusterAcceptHandler) != C_OK) { + serverPanic("Unrecoverable error creating Redis Cluster socket accept handler."); + } +} + +/* Reset a node performing a soft or hard reset: + * + * 1) All other nodes are forgotten. + * 2) All the assigned / open slots are released. + * 3) If the node is a slave, it turns into a master. + * 4) Only for hard reset: a new Node ID is generated. + * 5) Only for hard reset: currentEpoch and configEpoch are set to 0. + * 6) The new configuration is saved and the cluster state updated. + * 7) If the node was a slave, the whole data set is flushed away. */ +void clusterReset(int hard) { + dictIterator *di; + dictEntry *de; + int j; + + /* Turn into master. 
*/ + if (nodeIsSlave(myself)) { + clusterSetNodeAsMaster(myself); + replicationUnsetMaster(); + emptyData(-1,EMPTYDB_NO_FLAGS,NULL); + } + + /* Close slots, reset manual failover state. */ + clusterCloseAllSlots(); + resetManualFailover(); + + /* Unassign all the slots. */ + for (j = 0; j < CLUSTER_SLOTS; j++) clusterDelSlot(j); + + /* Recreate shards dict */ + dictEmpty(server.cluster->shards, NULL); + + /* Forget all the nodes, but myself. */ + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + if (node == myself) continue; + clusterDelNode(node); + } + dictReleaseIterator(di); + + /* Empty the nodes blacklist. */ + dictEmpty(server.cluster->nodes_black_list, NULL); + + /* Hard reset only: set epochs to 0, change node ID. */ + if (hard) { + sds oldname; + + server.cluster->currentEpoch = 0; + server.cluster->lastVoteEpoch = 0; + myself->configEpoch = 0; + serverLog(LL_NOTICE, "configEpoch set to 0 via CLUSTER RESET HARD"); + + /* To change the Node ID we need to remove the old name from the + * nodes table, change the ID, and re-add back with new name. */ + oldname = sdsnewlen(myself->name, CLUSTER_NAMELEN); + dictDelete(server.cluster->nodes,oldname); + sdsfree(oldname); + getRandomHexChars(myself->name, CLUSTER_NAMELEN); + getRandomHexChars(myself->shard_id, CLUSTER_NAMELEN); + clusterAddNode(myself); + serverLog(LL_NOTICE,"Node hard reset, now I'm %.40s", myself->name); + } + + /* Re-populate shards */ + clusterAddNodeToShard(myself->shard_id, myself); + + /* Make sure to persist the new config and update the state. 
*/ + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_FSYNC_CONFIG); +} + +/* ----------------------------------------------------------------------------- + * CLUSTER communication link + * -------------------------------------------------------------------------- */ +static clusterMsgSendBlock *createClusterMsgSendBlock(int type, uint32_t msglen) { + uint32_t blocklen = msglen + sizeof(clusterMsgSendBlock) - sizeof(clusterMsg); + clusterMsgSendBlock *msgblock = zcalloc(blocklen); + msgblock->refcount = 1; + msgblock->totlen = blocklen; + server.stat_cluster_links_memory += blocklen; + clusterBuildMessageHdr(&msgblock->msg,type,msglen); + return msgblock; +} + +static void clusterMsgSendBlockDecrRefCount(void *node) { + clusterMsgSendBlock *msgblock = (clusterMsgSendBlock*)node; + msgblock->refcount--; + serverAssert(msgblock->refcount >= 0); + if (msgblock->refcount == 0) { + server.stat_cluster_links_memory -= msgblock->totlen; + zfree(msgblock); + } +} + +clusterLink *createClusterLink(clusterNode *node) { + clusterLink *link = zmalloc(sizeof(*link)); + link->ctime = mstime(); + link->send_msg_queue = listCreate(); + listSetFreeMethod(link->send_msg_queue, clusterMsgSendBlockDecrRefCount); + link->head_msg_send_offset = 0; + link->send_msg_queue_mem = sizeof(list); + link->rcvbuf = zmalloc(link->rcvbuf_alloc = RCVBUF_INIT_LEN); + link->rcvbuf_len = 0; + server.stat_cluster_links_memory += link->rcvbuf_alloc + link->send_msg_queue_mem; + link->conn = NULL; + link->node = node; + /* Related node can only possibly be known at link creation time if this is an outbound link */ + link->inbound = (node == NULL); + if (!link->inbound) { + node->link = link; + } + return link; +} + +/* Free a cluster link, but does not free the associated node of course. + * This function will just make sure that the original node associated + * with this link will have the 'link' field set to NULL. 
*/ +void freeClusterLink(clusterLink *link) { + if (link->conn) { + connClose(link->conn); + link->conn = NULL; + } + server.stat_cluster_links_memory -= sizeof(list) + listLength(link->send_msg_queue)*sizeof(listNode); + listRelease(link->send_msg_queue); + server.stat_cluster_links_memory -= link->rcvbuf_alloc; + zfree(link->rcvbuf); + if (link->node) { + if (link->node->link == link) { + serverAssert(!link->inbound); + link->node->link = NULL; + } else if (link->node->inbound_link == link) { + serverAssert(link->inbound); + link->node->inbound_link = NULL; + } + } + zfree(link); +} + +void setClusterNodeToInboundClusterLink(clusterNode *node, clusterLink *link) { + serverAssert(!link->node); + serverAssert(link->inbound); + if (node->inbound_link) { + /* A peer may disconnect and then reconnect with us, and it's not guaranteed that + * we would always process the disconnection of the existing inbound link before + * accepting a new existing inbound link. Therefore, it's possible to have more than + * one inbound link from the same node at the same time. Our cleanup logic assumes + * a one to one relationship between nodes and inbound links, so we need to kill + * one of the links. The existing link is more likely the outdated one, but it's + * possible the other node may need to open another link. */ + serverLog(LL_DEBUG, "Replacing inbound link fd %d from node %.40s with fd %d", + node->inbound_link->conn->fd, node->name, link->conn->fd); + freeClusterLink(node->inbound_link); + } + serverAssert(!node->inbound_link); + node->inbound_link = link; + link->node = node; +} + +static void clusterConnAcceptHandler(connection *conn) { + clusterLink *link; + + if (connGetState(conn) != CONN_STATE_CONNECTED) { + serverLog(LL_VERBOSE, + "Error accepting cluster node connection: %s", connGetLastError(conn)); + connClose(conn); + return; + } + + /* Create a link object we use to handle the connection. + * It gets passed to the readable handler when data is available. 
 + * Initially the link->node pointer is set to NULL as we don't know + * which node it is, but the right node is referenced once we know the + * node identity. */ + link = createClusterLink(NULL); + link->conn = conn; + connSetPrivateData(conn, link); + + /* Register read handler */ + connSetReadHandler(conn, clusterReadHandler); +} + +#define MAX_CLUSTER_ACCEPTS_PER_CALL 1000 +void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { + int cport, cfd; + int max = MAX_CLUSTER_ACCEPTS_PER_CALL; + char cip[NET_IP_STR_LEN]; + int require_auth = TLS_CLIENT_AUTH_YES; + UNUSED(el); + UNUSED(mask); + UNUSED(privdata); + + /* If the server is starting up, don't accept cluster connections: + * UPDATE messages may interact with the database content. */ + if (server.masterhost == NULL && server.loading) return; + + while(max--) { + cfd = anetTcpAccept(server.neterr, fd, cip, sizeof(cip), &cport); + if (cfd == ANET_ERR) { + if (errno != EWOULDBLOCK) + serverLog(LL_VERBOSE, + "Error accepting cluster node: %s", server.neterr); + return; + } + + connection *conn = connCreateAccepted(connTypeOfCluster(), cfd, &require_auth); + + /* Make sure connection is not in an error state */ + if (connGetState(conn) != CONN_STATE_ACCEPTING) { + serverLog(LL_VERBOSE, + "Error creating an accepting connection for cluster node: %s", + connGetLastError(conn)); + connClose(conn); + return; + } + connEnableTcpNoDelay(conn); + connKeepAlive(conn,server.cluster_node_timeout / 1000 * 2); + + /* Use non-blocking I/O for cluster messages. */ + serverLog(LL_VERBOSE,"Accepting cluster node connection from %s:%d", cip, cport); + + /* Accept the connection now. connAccept() may call our handler directly + * or schedule it for later depending on connection implementation. 
+ */ + if (connAccept(conn, clusterConnAcceptHandler) == C_ERR) { + if (connGetState(conn) == CONN_STATE_ERROR) + serverLog(LL_VERBOSE, + "Error accepting cluster node connection: %s", + connGetLastError(conn)); + connClose(conn); + return; + } + } +} + +/* Return the approximated number of sockets we are using in order to + * take the cluster bus connections. */ +unsigned long getClusterConnectionsCount(void) { + /* We decrement the number of nodes by one, since there is the + * "myself" node too in the list. Each node uses two file descriptors, + * one incoming and one outgoing, thus the multiplication by 2. */ + return server.cluster_enabled ? + ((dictSize(server.cluster->nodes)-1)*2) : 0; +} + +/* ----------------------------------------------------------------------------- + * CLUSTER node API + * -------------------------------------------------------------------------- */ + +/* Create a new cluster node, with the specified flags. + * If "nodename" is NULL this is considered a first handshake and a random + * node name is assigned to this node (it will be fixed later when we'll + * receive the first pong). + * + * The node is created and returned to the user, but it is not automatically + * added to the nodes hash table. 
*/ +clusterNode *createClusterNode(char *nodename, int flags) { + clusterNode *node = zmalloc(sizeof(*node)); + + if (nodename) + memcpy(node->name, nodename, CLUSTER_NAMELEN); + else + getRandomHexChars(node->name, CLUSTER_NAMELEN); + getRandomHexChars(node->shard_id, CLUSTER_NAMELEN); + node->ctime = mstime(); + node->configEpoch = 0; + node->flags = flags; + memset(node->slots,0,sizeof(node->slots)); + node->slot_info_pairs = NULL; + node->slot_info_pairs_count = 0; + node->numslots = 0; + node->numslaves = 0; + node->slaves = NULL; + node->slaveof = NULL; + node->last_in_ping_gossip = 0; + node->ping_sent = node->pong_received = 0; + node->data_received = 0; + node->fail_time = 0; + node->link = NULL; + node->inbound_link = NULL; + memset(node->ip,0,sizeof(node->ip)); + node->hostname = sdsempty(); + node->human_nodename = sdsempty(); + node->tcp_port = 0; + node->cport = 0; + node->tls_port = 0; + node->fail_reports = listCreate(); + node->voted_time = 0; + node->orphaned_time = 0; + node->repl_offset_time = 0; + node->repl_offset = 0; + listSetFreeMethod(node->fail_reports,zfree); + return node; +} + +/* This function is called every time we get a failure report from a node. + * The side effect is to populate the fail_reports list (or to update + * the timestamp of an existing report). + * + * 'failing' is the node that is in failure state according to the + * 'sender' node. + * + * The function returns 0 if it just updates a timestamp of an existing + * failure report from the same sender. 1 is returned if a new failure + * report is created. */ +int clusterNodeAddFailureReport(clusterNode *failing, clusterNode *sender) { + list *l = failing->fail_reports; + listNode *ln; + listIter li; + clusterNodeFailReport *fr; + + /* If a failure report from the same sender already exists, just update + * the timestamp. 
*/ + listRewind(l,&li); + while ((ln = listNext(&li)) != NULL) { + fr = ln->value; + if (fr->node == sender) { + fr->time = mstime(); + return 0; + } + } + + /* Otherwise create a new report. */ + fr = zmalloc(sizeof(*fr)); + fr->node = sender; + fr->time = mstime(); + listAddNodeTail(l,fr); + return 1; +} + +/* Remove failure reports that are too old, where too old means reasonably + * older than the global node timeout. Note that anyway for a node to be + * flagged as FAIL we need to have a local PFAIL state that is at least + * older than the global node timeout, so we don't just trust the number + * of failure reports from other nodes. */ +void clusterNodeCleanupFailureReports(clusterNode *node) { + list *l = node->fail_reports; + listNode *ln; + listIter li; + clusterNodeFailReport *fr; + mstime_t maxtime = server.cluster_node_timeout * + CLUSTER_FAIL_REPORT_VALIDITY_MULT; + mstime_t now = mstime(); + + listRewind(l,&li); + while ((ln = listNext(&li)) != NULL) { + fr = ln->value; + if (now - fr->time > maxtime) listDelNode(l,ln); + } +} + +/* Remove the failing report for 'node' if it was previously considered + * failing by 'sender'. This function is called when a node informs us via + * gossip that a node is OK from its point of view (no FAIL or PFAIL flags). + * + * Note that this function is called relatively often as it gets called even + * when there are no nodes failing, and is O(N), however when the cluster is + * fine the failure reports list is empty so the function runs in constant + * time. + * + * The function returns 1 if the failure report was found and removed. + * Otherwise 0 is returned. */ +int clusterNodeDelFailureReport(clusterNode *node, clusterNode *sender) { + list *l = node->fail_reports; + listNode *ln; + listIter li; + clusterNodeFailReport *fr; + + /* Search for a failure report from this sender. 
 */
    listRewind(l,&li);
    while ((ln = listNext(&li)) != NULL) {
        fr = ln->value;
        if (fr->node == sender) break;
    }
    if (!ln) return 0; /* No failure report from this sender. */

    /* Remove the failure report. */
    listDelNode(l,ln);
    clusterNodeCleanupFailureReports(node);
    return 1;
}

/* Return the number of external nodes that believe 'node' is failing,
 * not including this node, that may have a PFAIL or FAIL state for this
 * node as well. */
int clusterNodeFailureReportsCount(clusterNode *node) {
    /* Purge expired reports first so only fresh ones are counted. */
    clusterNodeCleanupFailureReports(node);
    return listLength(node->fail_reports);
}

/* Remove 'slave' from the slaves array of 'master'.
 * Return C_OK if it was found and removed, C_ERR if it was not a
 * registered slave of that master. */
int clusterNodeRemoveSlave(clusterNode *master, clusterNode *slave) {
    int j;

    for (j = 0; j < master->numslaves; j++) {
        if (master->slaves[j] == slave) {
            /* Shift the tail of the array over the removed entry. */
            if ((j+1) < master->numslaves) {
                int remaining_slaves = (master->numslaves - j) - 1;
                memmove(master->slaves+j,master->slaves+(j+1),
                    (sizeof(*master->slaves) * remaining_slaves));
            }
            master->numslaves--;
            /* A master left without slaves is no longer a valid target
             * for replica migration. */
            if (master->numslaves == 0)
                master->flags &= ~CLUSTER_NODE_MIGRATE_TO;
            return C_OK;
        }
    }
    return C_ERR;
}

/* Add 'slave' to the slaves array of 'master'.
 * Return C_OK on success, C_ERR if the slave was already listed. */
int clusterNodeAddSlave(clusterNode *master, clusterNode *slave) {
    int j;

    /* If it's already a slave, don't add it again. */
    for (j = 0; j < master->numslaves; j++)
        if (master->slaves[j] == slave) return C_ERR;
    master->slaves = zrealloc(master->slaves,
        sizeof(clusterNode*)*(master->numslaves+1));
    master->slaves[master->numslaves] = slave;
    master->numslaves++;
    master->flags |= CLUSTER_NODE_MIGRATE_TO;
    return C_OK;
}

/* Return how many slaves of 'n' are not currently flagged FAIL. */
int clusterCountNonFailingSlaves(clusterNode *n) {
    int j, okslaves = 0;

    for (j = 0; j < n->numslaves; j++)
        if (!nodeFailed(n->slaves[j])) okslaves++;
    return okslaves;
}

/* Low level cleanup of the node structure. Only called by clusterDelNode(). */
void freeClusterNode(clusterNode *n) {
    sds nodename;
    int j;

    /* If the node has associated slaves, we have to set
     * all the slaves->slaveof fields to NULL (unknown). */
    for (j = 0; j < n->numslaves; j++)
        n->slaves[j]->slaveof = NULL;

    /* Remove this node from the list of slaves of its master. */
    if (nodeIsSlave(n) && n->slaveof) clusterNodeRemoveSlave(n->slaveof,n);

    /* Unlink from the set of nodes. */
    nodename = sdsnewlen(n->name, CLUSTER_NAMELEN);
    serverAssert(dictDelete(server.cluster->nodes,nodename) == DICT_OK);
    sdsfree(nodename);
    sdsfree(n->hostname);
    sdsfree(n->human_nodename);

    /* Release links and associated data structures. */
    if (n->link) freeClusterLink(n->link);
    if (n->inbound_link) freeClusterLink(n->inbound_link);
    listRelease(n->fail_reports);
    zfree(n->slaves);
    zfree(n);
}

/* Add a node to the nodes hash table. The dict takes ownership of the
 * sds key created here; the node pointer is stored as the value. */
void clusterAddNode(clusterNode *node) {
    int retval;

    retval = dictAdd(server.cluster->nodes,
            sdsnewlen(node->name,CLUSTER_NAMELEN), node);
    serverAssert(retval == DICT_OK);
}

/* Remove a node from the cluster. The function performs the high level
 * cleanup, calling freeClusterNode() for the low level cleanup.
 * Here we do the following:
 *
 * 1) Mark all the slots handled by it as unassigned.
 * 2) Remove all the failure reports sent by this node and referenced by
 *    other nodes.
 * 3) Remove the node from the owning shard
 * 4) Free the node with freeClusterNode() that will in turn remove it
 *    from the hash table and from the list of slaves of its master, if
 *    it is a slave node.
 */
void clusterDelNode(clusterNode *delnode) {
    int j;
    dictIterator *di;
    dictEntry *de;

    /* 1) Mark slots as unassigned. */
    for (j = 0; j < CLUSTER_SLOTS; j++) {
        if (server.cluster->importing_slots_from[j] == delnode)
            server.cluster->importing_slots_from[j] = NULL;
        if (server.cluster->migrating_slots_to[j] == delnode)
            server.cluster->migrating_slots_to[j] = NULL;
        if (server.cluster->slots[j] == delnode)
            clusterDelSlot(j);
    }

    /* 2) Remove failure reports. */
    di = dictGetSafeIterator(server.cluster->nodes);
    while((de = dictNext(di)) != NULL) {
        clusterNode *node = dictGetVal(de);

        if (node == delnode) continue;
        clusterNodeDelFailureReport(node,delnode);
    }
    dictReleaseIterator(di);

    /* 3) Remove the node from the owning shard */
    clusterRemoveNodeFromShard(delnode);

    /* 4) Free the node, unlinking it from the cluster. */
    freeClusterNode(delnode);
}

/* Node lookup by name. Returns NULL if 'name' is not a syntactically
 * valid node ID or if no node with that ID is known. */
clusterNode *clusterLookupNode(const char *name, int length) {
    if (verifyClusterNodeId(name, length) != C_OK) return NULL;
    sds s = sdsnewlen(name, length);
    dictEntry *de = dictFind(server.cluster->nodes, s);
    sdsfree(s);
    if (de == NULL) return NULL;
    return dictGetVal(de);
}

/* Get all the nodes in my shard.
 * Note that the list returned is not computed on the fly
 * via slaveof; rather, it is maintained permanently to
 * track the shard membership and its life cycle is tied
 * to this Redis process. Therefore, the caller must not
 * release the list. */
list *clusterGetNodesInMyShard(clusterNode *node) {
    sds s = sdsnewlen(node->shard_id, CLUSTER_NAMELEN);
    dictEntry *de = dictFind(server.cluster->shards,s);
    sdsfree(s);
    return (de != NULL) ? dictGetVal(de) : NULL;
}

/* This is only used after the handshake. When we connect a given IP/PORT
 * as a result of CLUSTER MEET we don't have the node name yet, so we
 * pick a random one, and will fix it when we receive the PONG request using
 * this function.
 */
void clusterRenameNode(clusterNode *node, char *newname) {
    int retval;
    sds s = sdsnewlen(node->name, CLUSTER_NAMELEN);

    serverLog(LL_DEBUG,"Renaming node %.40s into %.40s",
        node->name, newname);
    /* Re-key the nodes dict: delete under the old name, then re-add
     * after updating node->name in place. */
    retval = dictDelete(server.cluster->nodes, s);
    sdsfree(s);
    serverAssert(retval == DICT_OK);
    memcpy(node->name, newname, CLUSTER_NAMELEN);
    clusterAddNode(node);
}

/* Add 'node' to the list of nodes of the shard identified by 'shard_id'.
 * The shard entry (and its list) is created on first use; adding the same
 * node twice is a no-op. */
void clusterAddNodeToShard(const char *shard_id, clusterNode *node) {
    sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN);
    dictEntry *de = dictFind(server.cluster->shards,s);
    if (de == NULL) {
        list *l = listCreate();
        listAddNodeTail(l, node);
        /* On success the shards dict takes ownership of 's'. */
        serverAssert(dictAdd(server.cluster->shards, s, l) == DICT_OK);
    } else {
        list *l = dictGetVal(de);
        if (listSearchKey(l, node) == NULL) {
            listAddNodeTail(l, node);
        }
        sdsfree(s);
    }
}

/* Remove 'node' from its owning shard's node list. When the list becomes
 * empty, the shard entry itself is deleted from the shards dict. */
void clusterRemoveNodeFromShard(clusterNode *node) {
    sds s = sdsnewlen(node->shard_id, CLUSTER_NAMELEN);
    dictEntry *de = dictFind(server.cluster->shards, s);
    if (de != NULL) {
        list *l = dictGetVal(de);
        listNode *ln = listSearchKey(l, node);
        if (ln != NULL) {
            listDelNode(l, ln);
        }
        if (listLength(l) == 0) {
            dictDelete(server.cluster->shards, s);
        }
    }
    sdsfree(s);
}

/* -----------------------------------------------------------------------------
 * CLUSTER config epoch handling
 * -------------------------------------------------------------------------- */

/* Return the greatest configEpoch found in the cluster, or the current
 * epoch if greater than any node configEpoch.
 */
uint64_t clusterGetMaxEpoch(void) {
    uint64_t max = 0;
    dictIterator *di;
    dictEntry *de;

    /* Scan every known node for the largest configEpoch. */
    di = dictGetSafeIterator(server.cluster->nodes);
    while((de = dictNext(di)) != NULL) {
        clusterNode *node = dictGetVal(de);
        if (node->configEpoch > max) max = node->configEpoch;
    }
    dictReleaseIterator(di);
    /* The cluster-wide currentEpoch may exceed any single node's epoch. */
    if (max < server.cluster->currentEpoch) max = server.cluster->currentEpoch;
    return max;
}

/* If this node epoch is zero or is not already the greatest across the
 * cluster (from the POV of the local configuration), this function will:
 *
 * 1) Generate a new config epoch, incrementing the current epoch.
 * 2) Assign the new epoch to this node, WITHOUT any consensus.
 * 3) Persist the configuration on disk before sending packets with the
 *    new configuration.
 *
 * If the new config epoch is generated and assigned, C_OK is returned,
 * otherwise C_ERR is returned (since the node has already the greatest
 * configuration around) and no operation is performed.
 *
 * Important note: this function violates the principle that config epochs
 * should be generated with consensus and should be unique across the cluster.
 * However Redis Cluster uses this auto-generated new config epochs in two
 * cases:
 *
 * 1) When slots are closed after importing. Otherwise resharding would be
 *    too expensive.
 * 2) When CLUSTER FAILOVER is called with options that force a slave to
 *    failover its master even if there is not master majority able to
 *    create a new configuration epoch.
 *
 * Redis Cluster will not explode using this function, even in the case of
 * a collision between this node and another node, generating the same
 * configuration epoch unilaterally, because the config epoch conflict
 * resolution algorithm will eventually move colliding nodes to different
 * config epochs. However using this function may violate the "last failover
 * wins" rule, so should only be used with care.
 */
int clusterBumpConfigEpochWithoutConsensus(void) {
    uint64_t maxEpoch = clusterGetMaxEpoch();

    if (myself->configEpoch == 0 ||
        myself->configEpoch != maxEpoch)
    {
        /* Take the next epoch and schedule a config save + fsync before
         * any packet with the new configuration is sent. */
        server.cluster->currentEpoch++;
        myself->configEpoch = server.cluster->currentEpoch;
        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                             CLUSTER_TODO_FSYNC_CONFIG);
        serverLog(LL_NOTICE,
            "New configEpoch set to %llu",
            (unsigned long long) myself->configEpoch);
        return C_OK;
    } else {
        return C_ERR;
    }
}

/* This function is called when this node is a master, and we receive from
 * another master a configuration epoch that is equal to our configuration
 * epoch.
 *
 * BACKGROUND
 *
 * It is not possible that different slaves get the same config
 * epoch during a failover election, because the slaves need to get voted
 * by a majority. However when we perform a manual resharding of the cluster
 * the node will assign a configuration epoch to itself without to ask
 * for agreement. Usually resharding happens when the cluster is working well
 * and is supervised by the sysadmin, however it is possible for a failover
 * to happen exactly while the node we are resharding a slot to assigns itself
 * a new configuration epoch, but before it is able to propagate it.
 *
 * So technically it is possible in this condition that two nodes end with
 * the same configuration epoch.
 *
 * Another possibility is that there are bugs in the implementation causing
 * this to happen.
 *
 * Moreover when a new cluster is created, all the nodes start with the same
 * configEpoch. This collision resolution code allows nodes to automatically
 * end with a different configEpoch at startup automatically.
 *
 * In all the cases, we want a mechanism that resolves this issue automatically
 * as a safeguard. The same configuration epoch for masters serving different
 * set of slots is not harmful, but it is if the nodes end serving the same
 * slots for some reason (manual errors or software bugs) without a proper
 * failover procedure.
 *
 * In general we want a system that eventually always ends with different
 * masters having different configuration epochs whatever happened, since
 * nothing is worse than a split-brain condition in a distributed system.
 *
 * BEHAVIOR
 *
 * When this function gets called, what happens is that if this node
 * has the lexicographically smaller Node ID compared to the other node
 * with the conflicting epoch (the 'sender' node), it will assign itself
 * the greatest configuration epoch currently detected among nodes plus 1.
 *
 * This means that even if there are multiple nodes colliding, the node
 * with the greatest Node ID never moves forward, so eventually all the nodes
 * end with a different configuration epoch.
 */
void clusterHandleConfigEpochCollision(clusterNode *sender) {
    /* Prerequisites: nodes have the same configEpoch and are both masters. */
    if (sender->configEpoch != myself->configEpoch ||
        !clusterNodeIsMaster(sender) || !clusterNodeIsMaster(myself)) return;
    /* Don't act if the colliding node has a smaller Node ID. */
    if (memcmp(sender->name,myself->name,CLUSTER_NAMELEN) <= 0) return;
    /* Get the next ID available at the best of this node knowledge. */
    server.cluster->currentEpoch++;
    myself->configEpoch = server.cluster->currentEpoch;
    clusterSaveConfigOrDie(1);
    serverLog(LL_VERBOSE,
        "WARNING: configEpoch collision with node %.40s (%s)."
        " configEpoch set to %llu",
        sender->name,sender->human_nodename,
        (unsigned long long) myself->configEpoch);
}

/* -----------------------------------------------------------------------------
 * CLUSTER nodes blacklist
 *
 * The nodes blacklist is just a way to ensure that a given node with a given
 * Node ID is not re-added before some time elapsed (this time is specified
 * in seconds in CLUSTER_BLACKLIST_TTL).
 *
 * This is useful when we want to remove a node from the cluster completely:
 * when CLUSTER FORGET is called, it also puts the node into the blacklist so
 * that even if we receive gossip messages from other nodes that still remember
 * about the node we want to remove, we don't re-add it before some time.
 *
 * Currently the CLUSTER_BLACKLIST_TTL is set to 1 minute, this means
 * that redis-cli has 60 seconds to send CLUSTER FORGET messages to nodes
 * in the cluster without dealing with the problem of other nodes re-adding
 * back the node to nodes we already sent the FORGET command to.
 *
 * The data structure used is a hash table with an sds string representing
 * the node ID as key, and the time when it is ok to re-add the node as
 * value.
 * -------------------------------------------------------------------------- */

#define CLUSTER_BLACKLIST_TTL 60      /* 1 minute. */


/* Before the addNode() or Exists() operations we always remove expired
 * entries from the black list. This is an O(N) operation but it is not a
 * problem since add / exists operations are called very infrequently and
 * the hash table is supposed to contain very little elements at max.
 * However without the cleanup during long uptime and with some automated
 * node add/removal procedures, entries could accumulate.
 */
void clusterBlacklistCleanup(void) {
    dictIterator *di;
    dictEntry *de;

    di = dictGetSafeIterator(server.cluster->nodes_black_list);
    while((de = dictNext(di)) != NULL) {
        /* NOTE(review): the stored value is set via
         * dictSetUnsignedIntegerVal() but read back as int64_t here;
         * values stay far below INT64_MAX (unixtime + TTL) so the
         * signed/unsigned mismatch is harmless in practice. */
        int64_t expire = dictGetUnsignedIntegerVal(de);

        if (expire < server.unixtime)
            dictDelete(server.cluster->nodes_black_list,dictGetKey(de));
    }
    dictReleaseIterator(di);
}

/* Cleanup the blacklist and add a new node ID to the black list. */
void clusterBlacklistAddNode(clusterNode *node) {
    dictEntry *de;
    sds id = sdsnewlen(node->name,CLUSTER_NAMELEN);

    clusterBlacklistCleanup();
    if (dictAdd(server.cluster->nodes_black_list,id,NULL) == DICT_OK) {
        /* If the key was added, duplicate the sds string representation of
         * the key for the next lookup. We'll free it at the end. */
        id = sdsdup(id);
    }
    /* Refresh the expiry whether the entry was just added or existed. */
    de = dictFind(server.cluster->nodes_black_list,id);
    dictSetUnsignedIntegerVal(de,time(NULL)+CLUSTER_BLACKLIST_TTL);
    sdsfree(id);
}

/* Return non-zero if the specified node ID exists in the blacklist.
 * You don't need to pass an sds string here, any pointer to 40 bytes
 * will work. */
int clusterBlacklistExists(char *nodeid) {
    sds id = sdsnewlen(nodeid,CLUSTER_NAMELEN);
    int retval;

    clusterBlacklistCleanup();
    retval = dictFind(server.cluster->nodes_black_list,id) != NULL;
    sdsfree(id);
    return retval;
}

/* -----------------------------------------------------------------------------
 * CLUSTER messages exchange - PING/PONG and gossip
 * -------------------------------------------------------------------------- */

/* This function checks if a given node should be marked as FAIL.
 * It happens if the following conditions are met:
 *
 * 1) We received enough failure reports from other master nodes via gossip.
 *    Enough means that the majority of the masters signaled the node is
 *    down recently.
 * 2) We believe this node is in PFAIL state.
 *
 * If a failure is detected we also inform the whole cluster about this
 * event trying to force every other node to set the FAIL flag for the node.
 *
 * Note that the form of agreement used here is weak, as we collect the majority
 * of masters state during some time, and even if we force agreement by
 * propagating the FAIL message, because of partitions we may not reach every
 * node. However:
 *
 * 1) Either we reach the majority and eventually the FAIL state will propagate
 *    to all the cluster.
 * 2) Or there is no majority so no slave promotion will be authorized and the
 *    FAIL flag will be cleared after some time.
 */
void markNodeAsFailingIfNeeded(clusterNode *node) {
    int failures;
    /* Majority of masters: floor(size/2) + 1. */
    int needed_quorum = (server.cluster->size / 2) + 1;

    if (!nodeTimedOut(node)) return; /* We can reach it. */
    if (nodeFailed(node)) return; /* Already FAILing. */

    failures = clusterNodeFailureReportsCount(node);
    /* Also count myself as a voter if I'm a master. */
    if (clusterNodeIsMaster(myself)) failures++;
    if (failures < needed_quorum) return; /* No weak agreement from masters. */

    serverLog(LL_NOTICE,
        "Marking node %.40s (%s) as failing (quorum reached).", node->name, node->human_nodename);

    /* Mark the node as failing: PFAIL is promoted to FAIL. */
    node->flags &= ~CLUSTER_NODE_PFAIL;
    node->flags |= CLUSTER_NODE_FAIL;
    node->fail_time = mstime();

    /* Broadcast the failing node name to everybody, forcing all the other
     * reachable nodes to flag the node as FAIL.
     * We do that even if this node is a replica and not a master: anyway
     * the failing state is triggered collecting failure reports from masters,
     * so here the replica is only helping propagating this status. */
    clusterSendFail(node->name);
    clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
}

/* This function is called only if a node is marked as FAIL, but we are able
 * to reach it again. It checks if there are the conditions to undo the FAIL
 * state.
 */
void clearNodeFailureIfNeeded(clusterNode *node) {
    mstime_t now = mstime();

    serverAssert(nodeFailed(node));

    /* For slaves we always clear the FAIL flag if we can contact the
     * node again. The same applies to masters serving no slots. */
    if (nodeIsSlave(node) || node->numslots == 0) {
        serverLog(LL_NOTICE,
            "Clear FAIL state for node %.40s (%s):%s is reachable again.",
                node->name,node->human_nodename,
                nodeIsSlave(node) ? "replica" : "master without slots");
        node->flags &= ~CLUSTER_NODE_FAIL;
        clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
    }

    /* If it is a master and...
     * 1) The FAIL state is old enough.
     * 2) It is yet serving slots from our point of view (not failed over).
     * Apparently no one is going to fix these slots, clear the FAIL flag. */
    if (clusterNodeIsMaster(node) && node->numslots > 0 &&
        (now - node->fail_time) >
        (server.cluster_node_timeout * CLUSTER_FAIL_UNDO_TIME_MULT))
    {
        serverLog(LL_NOTICE,
            "Clear FAIL state for node %.40s (%s): is reachable again and nobody is serving its slots after some time.",
                node->name, node->human_nodename);
        node->flags &= ~CLUSTER_NODE_FAIL;
        clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
    }
}

/* Return true if we already have a node in HANDSHAKE state matching the
 * specified ip address and port number. This function is used in order to
 * avoid adding a new handshake node for the same address multiple times. */
int clusterHandshakeInProgress(char *ip, int port, int cport) {
    dictIterator *di;
    dictEntry *de;

    di = dictGetSafeIterator(server.cluster->nodes);
    while((de = dictNext(di)) != NULL) {
        clusterNode *node = dictGetVal(de);

        if (!nodeInHandshake(node)) continue;
        if (!strcasecmp(node->ip,ip) &&
            getNodeDefaultClientPort(node) == port &&
            node->cport == cport) break;
    }
    dictReleaseIterator(di);
    /* 'de' is non-NULL only if the loop broke on a match. */
    return de != NULL;
}

/* Start a handshake with the specified address if there is not one
 * already in progress.
 * Returns non-zero if the handshake was actually
 * started. On error zero is returned and errno is set to one of the
 * following values:
 *
 * EAGAIN - There is already a handshake in progress for this address.
 * EINVAL - IP or port are not valid. */
int clusterStartHandshake(char *ip, int port, int cport) {
    clusterNode *n;
    char norm_ip[NET_IP_STR_LEN];
    struct sockaddr_storage sa;

    /* IP sanity check */
    if (inet_pton(AF_INET,ip,
            &(((struct sockaddr_in *)&sa)->sin_addr)))
    {
        sa.ss_family = AF_INET;
    } else if (inet_pton(AF_INET6,ip,
            &(((struct sockaddr_in6 *)&sa)->sin6_addr)))
    {
        sa.ss_family = AF_INET6;
    } else {
        errno = EINVAL;
        return 0;
    }

    /* Port sanity check */
    if (port <= 0 || port > 65535 || cport <= 0 || cport > 65535) {
        errno = EINVAL;
        return 0;
    }

    /* Set norm_ip as the normalized string representation of the node
     * IP address. */
    memset(norm_ip,0,NET_IP_STR_LEN);
    if (sa.ss_family == AF_INET)
        inet_ntop(AF_INET,
            (void*)&(((struct sockaddr_in *)&sa)->sin_addr),
            norm_ip,NET_IP_STR_LEN);
    else
        inet_ntop(AF_INET6,
            (void*)&(((struct sockaddr_in6 *)&sa)->sin6_addr),
            norm_ip,NET_IP_STR_LEN);

    if (clusterHandshakeInProgress(norm_ip,port,cport)) {
        errno = EAGAIN;
        return 0;
    }

    /* Add the node with a random address (NULL as first argument to
     * createClusterNode()). Everything will be fixed during the
     * handshake. */
    n = createClusterNode(NULL,CLUSTER_NODE_HANDSHAKE|CLUSTER_NODE_MEET);
    memcpy(n->ip,norm_ip,sizeof(n->ip));
    if (server.tls_cluster) {
        n->tls_port = port;
    } else {
        n->tcp_port = port;
    }
    n->cport = cport;
    clusterAddNode(n);
    return 1;
}

/* Extract the TCP and TLS client ports from a cluster message header.
 * 'port' always carries the primary (cluster-bus-preferred) port and
 * 'pport' the alternate one; which is TCP vs TLS depends on whether
 * this node runs the cluster bus over TLS. */
static void getClientPortFromClusterMsg(clusterMsg *hdr, int *tls_port, int *tcp_port) {
    if (server.tls_cluster) {
        *tls_port = ntohs(hdr->port);
        *tcp_port = ntohs(hdr->pport);
    } else {
        *tls_port = ntohs(hdr->pport);
        *tcp_port = ntohs(hdr->port);
    }
}

/* Same as getClientPortFromClusterMsg() but for a gossip entry. */
static void getClientPortFromGossip(clusterMsgDataGossip *g, int *tls_port, int *tcp_port) {
    if (server.tls_cluster) {
        *tls_port = ntohs(g->port);
        *tcp_port = ntohs(g->pport);
    } else {
        *tls_port = ntohs(g->pport);
        *tcp_port = ntohs(g->port);
    }
}

/* Process the gossip section of PING or PONG packets.
 * Note that this function assumes that the packet is already sanity-checked
 * by the caller, not in the content of the gossip section, but in the
 * length. */
void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) {
    uint16_t count = ntohs(hdr->count);
    clusterMsgDataGossip *g = (clusterMsgDataGossip*) hdr->data.ping.gossip;
    clusterNode *sender = link->node ? link->node : clusterLookupNode(hdr->sender, CLUSTER_NAMELEN);

    while(count--) {
        uint16_t flags = ntohs(g->flags);
        clusterNode *node;
        sds ci;

        if (server.verbosity == LL_DEBUG) {
            ci = representClusterNodeFlags(sdsempty(), flags);
            serverLog(LL_DEBUG,"GOSSIP %.40s %s:%d@%d %s",
                g->nodename,
                g->ip,
                ntohs(g->port),
                ntohs(g->cport),
                ci);
            sdsfree(ci);
        }

        /* Convert port and pport into TCP port and TLS port. */
        int msg_tls_port, msg_tcp_port;
        getClientPortFromGossip(g, &msg_tls_port, &msg_tcp_port);

        /* Update our state accordingly to the gossip sections */
        node = clusterLookupNode(g->nodename, CLUSTER_NAMELEN);
        if (node) {
            /* We already know this node.
               Handle failure reports, only when the sender is a master. */
            if (sender && clusterNodeIsMaster(sender) && node != myself) {
                if (flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) {
                    if (clusterNodeAddFailureReport(node,sender)) {
                        serverLog(LL_VERBOSE,
                            "Node %.40s (%s) reported node %.40s (%s) as not reachable.",
                            sender->name, sender->human_nodename, node->name, node->human_nodename);
                    }
                    markNodeAsFailingIfNeeded(node);
                } else {
                    if (clusterNodeDelFailureReport(node,sender)) {
                        serverLog(LL_VERBOSE,
                            "Node %.40s (%s) reported node %.40s (%s) is back online.",
                            sender->name, sender->human_nodename, node->name, node->human_nodename);
                    }
                }
            }

            /* If from our POV the node is up (no failure flags are set),
             * we have no pending ping for the node, nor we have failure
             * reports for this node, update the last pong time with the
             * one we see from the other nodes. */
            if (!(flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) &&
                node->ping_sent == 0 &&
                clusterNodeFailureReportsCount(node) == 0)
            {
                /* Gossip carries pong time in seconds (32 bit). */
                mstime_t pongtime = ntohl(g->pong_received);
                pongtime *= 1000; /* Convert back to milliseconds. */

                /* Replace the pong time with the received one only if
                 * it's greater than our view but is not in the future
                 * (with 500 milliseconds tolerance) from the POV of our
                 * clock. */
                if (pongtime <= (server.mstime+500) &&
                    pongtime > node->pong_received)
                {
                    node->pong_received = pongtime;
                }
            }

            /* If we already know this node, but it is not reachable, and
             * we see a different address in the gossip section of a node that
             * can talk with this other node, update the address, disconnect
             * the old link if any, so that we'll attempt to connect with the
             * new address. */
            if (node->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL) &&
                !(flags & CLUSTER_NODE_NOADDR) &&
                !(flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) &&
                (strcasecmp(node->ip,g->ip) ||
                 node->tls_port != (server.tls_cluster ? ntohs(g->port) : ntohs(g->pport)) ||
                 node->tcp_port != (server.tls_cluster ? ntohs(g->pport) : ntohs(g->port)) ||
                 node->cport != ntohs(g->cport)))
            {
                if (node->link) freeClusterLink(node->link);
                memcpy(node->ip,g->ip,NET_IP_STR_LEN);
                node->tcp_port = msg_tcp_port;
                node->tls_port = msg_tls_port;
                node->cport = ntohs(g->cport);
                node->flags &= ~CLUSTER_NODE_NOADDR;
            }
        } else {
            /* If it's not in NOADDR state and we don't have it, we
             * add it to our trusted dict with exact nodeid and flag.
             * Note that we cannot simply start a handshake against
             * this IP/PORT pairs, since IP/PORT can be reused already,
             * otherwise we risk joining another cluster.
             *
             * Note that we require that the sender of this gossip message
             * is a well known node in our cluster, otherwise we risk
             * joining another cluster. */
            if (sender &&
                !(flags & CLUSTER_NODE_NOADDR) &&
                !clusterBlacklistExists(g->nodename))
            {
                clusterNode *node;
                node = createClusterNode(g->nodename, flags);
                memcpy(node->ip,g->ip,NET_IP_STR_LEN);
                node->tcp_port = msg_tcp_port;
                node->tls_port = msg_tls_port;
                node->cport = ntohs(g->cport);
                clusterAddNode(node);
            }
        }

        /* Next node */
        g++;
    }
}

/* IP -> string conversion. 'buf' is supposed to at least be 46 bytes.
 * If 'announced_ip' length is non-zero, it is used instead of extracting
 * the IP from the socket peer address. */
int nodeIp2String(char *buf, clusterLink *link, char *announced_ip) {
    if (announced_ip[0] != '\0') {
        memcpy(buf,announced_ip,NET_IP_STR_LEN);
        buf[NET_IP_STR_LEN-1] = '\0'; /* We are not sure the input is sane. */
        return C_OK;
    } else {
        if (connAddrPeerName(link->conn, buf, NET_IP_STR_LEN, NULL) == -1) {
            serverLog(LL_NOTICE, "Error converting peer IP to string: %s",
                link->conn ? connGetLastError(link->conn) : "no link");
            return C_ERR;
        }
        return C_OK;
    }
}

/* Update the node address to the IP address that can be extracted
 * from link->fd, or if hdr->myip is non empty, to the address the node
 * is announcing us.
 * The port is taken from the packet header as well.
 *
 * If the address or port changed, disconnect the node link so that we'll
 * connect again to the new address.
 *
 * If the ip/port pair are already correct no operation is performed at
 * all.
 *
 * The function returns 0 if the node address is still the same,
 * otherwise 1 is returned. */
int nodeUpdateAddressIfNeeded(clusterNode *node, clusterLink *link,
                              clusterMsg *hdr)
{
    char ip[NET_IP_STR_LEN] = {0};
    int cport = ntohs(hdr->cport);
    int tcp_port, tls_port;
    getClientPortFromClusterMsg(hdr, &tls_port, &tcp_port);

    /* We don't proceed if the link is the same as the sender link, as this
     * function is designed to see if the node link is consistent with the
     * symmetric link that is used to receive PINGs from the node.
     *
     * As a side effect this function never frees the passed 'link', so
     * it is safe to call during packet processing. */
    if (link == node->link) return 0;

    /* If the peer IP is unavailable for some reasons like invalid fd or closed
     * link, just give up the update this time, and the update will be retried
     * in the next round of PINGs */
    if (nodeIp2String(ip,link,hdr->myip) == C_ERR) return 0;

    if (node->tcp_port == tcp_port && node->cport == cport && node->tls_port == tls_port &&
        strcmp(ip,node->ip) == 0) return 0;

    /* IP / port is different, update it. */
    memcpy(node->ip,ip,sizeof(ip));
    node->tcp_port = tcp_port;
    node->tls_port = tls_port;
    node->cport = cport;
    /* Drop the stale link so a reconnect to the new address is attempted. */
    if (node->link) freeClusterLink(node->link);
    node->flags &= ~CLUSTER_NODE_NOADDR;
    serverLog(LL_NOTICE,"Address updated for node %.40s (%s), now %s:%d",
        node->name, node->human_nodename, node->ip, getNodeDefaultClientPort(node));

    /* Check if this is our master and we have to change the
     * replication target as well. */
    if (nodeIsSlave(myself) && myself->slaveof == node)
        replicationSetMaster(node->ip, getNodeDefaultReplicationPort(node));
    return 1;
}

/* Reconfigure the specified node 'n' as a master. This function is called when
 * a node that we believed to be a slave is now acting as master in order to
 * update the state of the node. */
void clusterSetNodeAsMaster(clusterNode *n) {
    if (clusterNodeIsMaster(n)) return;

    if (n->slaveof) {
        clusterNodeRemoveSlave(n->slaveof,n);
        if (n != myself) n->flags |= CLUSTER_NODE_MIGRATE_TO;
    }
    n->flags &= ~CLUSTER_NODE_SLAVE;
    n->flags |= CLUSTER_NODE_MASTER;
    n->slaveof = NULL;

    /* Update config and state. */
    clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                         CLUSTER_TODO_UPDATE_STATE);
}

/* This function is called when we receive a master configuration via a
 * PING, PONG or UPDATE packet. What we receive is a node, a configEpoch of the
 * node, and the set of slots claimed under this configEpoch.
 *
 * What we do is to rebind the slots with newer configuration compared to our
 * local configuration, and if needed, we turn ourself into a replica of the
 * node (see the function comments for more info).
 *
 * The 'sender' is the node for which we received a configuration update.
 * Sometimes it is not actually the "Sender" of the information, like in the
 * case we receive the info via an UPDATE packet. */
void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoch, unsigned char *slots) {
    int j;
    clusterNode *curmaster = NULL, *newmaster = NULL;
    /* The dirty slots list is a list of slots for which we lose the ownership
     * while having still keys inside. This usually happens after a failover
     * or after a manual cluster reconfiguration operated by the admin.
     *
     * If the update message is not able to demote a master to slave (in this
     * case we'll resync with the master updating the whole key space), we
     * need to delete all the keys in the slots we lost ownership. */
    uint16_t dirty_slots[CLUSTER_SLOTS];
    int dirty_slots_count = 0;

    /* We should detect if sender is new master of our shard.
     * We will know it if all our slots were migrated to sender, and sender
     * has no slots except ours */
    int sender_slots = 0;
    int migrated_our_slots = 0;

    /* Here we set curmaster to this node or the node this node
     * replicates to if it's a slave. In the for loop we are
     * interested to check if slots are taken away from curmaster. */
    curmaster = clusterNodeIsMaster(myself) ? myself : myself->slaveof;

    if (sender == myself) {
        serverLog(LL_NOTICE,"Discarding UPDATE message about myself.");
        return;
    }

    for (j = 0; j < CLUSTER_SLOTS; j++) {
        if (bitmapTestBit(slots,j)) {
            sender_slots++;

            /* The slot is already bound to the sender of this message. */
            if (server.cluster->slots[j] == sender) {
                bitmapClearBit(server.cluster->owner_not_claiming_slot, j);
                continue;
            }

            /* The slot is in importing state, it should be modified only
             * manually via redis-cli (example: a resharding is in progress
             * and the migrating side slot was already closed and is advertising
             * a new config. We still want the slot to be closed manually). */
            if (server.cluster->importing_slots_from[j]) continue;

            /* We rebind the slot to the new node claiming it if:
             * 1) The slot was unassigned or the previous owner no longer owns the slot or
             *    the new node claims it with a greater configEpoch.
             * 2) We are not currently importing the slot. */
            if (isSlotUnclaimed(j) ||
                server.cluster->slots[j]->configEpoch < senderConfigEpoch)
            {
                /* Was this slot mine, and still contains keys? Mark it as
                 * a dirty slot. */
                if (server.cluster->slots[j] == myself &&
                    countKeysInSlot(j) &&
                    sender != myself)
                {
                    dirty_slots[dirty_slots_count] = j;
                    dirty_slots_count++;
                }

                if (server.cluster->slots[j] == curmaster) {
                    newmaster = sender;
                    migrated_our_slots++;
                }
                clusterDelSlot(j);
                clusterAddSlot(sender,j);
                clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                                     CLUSTER_TODO_UPDATE_STATE|
                                     CLUSTER_TODO_FSYNC_CONFIG);
            }
        } else if (server.cluster->slots[j] == sender) {
            /* The slot is currently bound to the sender but the sender is no longer
             * claiming it. We don't want to unbind the slot yet as it can cause the cluster
             * to move to FAIL state and also throw client error. Keeping the slot bound to
             * the previous owner will cause a few client side redirects, but won't throw
             * any errors. We will keep track of the uncertainty in ownership to avoid
             * propagating misinformation about this slot's ownership using UPDATE
             * messages. */
            bitmapSetBit(server.cluster->owner_not_claiming_slot, j);
        }
    }

    /* After updating the slots configuration, don't do any actual change
     * in the state of the server if a module disabled Redis Cluster
     * keys redirections. */
    if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION)
        return;

    /* If at least one slot was reassigned from a node to another node
     * with a greater configEpoch, it is possible that:
     * 1) We are a master left without slots. This means that we were
     *    failed over and we should turn into a replica of the new
     *    master.
     * 2) We are a slave and our master is left without slots. We need
     *    to replicate to the new slots owner. */
    if (newmaster && curmaster->numslots == 0 &&
            (server.cluster_allow_replica_migration ||
             sender_slots == migrated_our_slots)) {
        serverLog(LL_NOTICE,
            "Configuration change detected. Reconfiguring myself "
            "as a replica of %.40s (%s)", sender->name, sender->human_nodename);
        clusterSetMaster(sender);
        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                             CLUSTER_TODO_UPDATE_STATE|
                             CLUSTER_TODO_FSYNC_CONFIG);
    } else if (myself->slaveof && myself->slaveof->slaveof &&
               /* In some rare case when CLUSTER FAILOVER TAKEOVER is used, it
                * can happen that myself is a replica of a replica of myself. If
                * this happens, we do nothing to avoid a crash and wait for the
                * admin to repair the cluster. */
               myself->slaveof->slaveof != myself)
    {
        /* Safeguard against sub-replicas. A replica's master can turn itself
         * into a replica if its last slot is removed. If no other node takes
         * over the slot, there is nothing else to trigger replica migration. */
        serverLog(LL_NOTICE,
            "I'm a sub-replica! Reconfiguring myself as a replica of grandmaster %.40s (%s)",
            myself->slaveof->slaveof->name, myself->slaveof->slaveof->human_nodename);
        clusterSetMaster(myself->slaveof->slaveof);
        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                             CLUSTER_TODO_UPDATE_STATE|
                             CLUSTER_TODO_FSYNC_CONFIG);
    } else if (dirty_slots_count) {
        /* If we are here, we received an update message which removed
         * ownership for certain slots we still have keys about, but still
         * we are serving some slots, so this master node was not demoted to
         * a slave.
         *
         * In order to maintain a consistent state between keys and slots
         * we need to remove all the keys from the slots we lost. */
        for (j = 0; j < dirty_slots_count; j++)
            delKeysInSlot(dirty_slots[j]);
    }
}

/* Cluster ping extensions.
 *
 * The ping/pong/meet messages support arbitrary extensions to add additional
 * metadata to the messages that are sent between the various nodes in the
 * cluster.
The extensions take the form: + * [ Header length + type (8 bytes) ] + * [ Extension information (Arbitrary length, but must be 8 byte padded) ] + */ + + +/* Returns the length of a given extension */ +static uint32_t getPingExtLength(clusterMsgPingExt *ext) { + return ntohl(ext->length); +} + +/* Returns the initial position of ping extensions. May return an invalid + * address if there are no ping extensions. */ +static clusterMsgPingExt *getInitialPingExt(clusterMsg *hdr, int count) { + clusterMsgPingExt *initial = (clusterMsgPingExt*) &(hdr->data.ping.gossip[count]); + return initial; +} + +/* Given a current ping extension, returns the start of the next extension. May return + * an invalid address if there are no further ping extensions. */ +static clusterMsgPingExt *getNextPingExt(clusterMsgPingExt *ext) { + clusterMsgPingExt *next = (clusterMsgPingExt *) (((char *) ext) + getPingExtLength(ext)); + return next; +} + +/* All PING extensions must be 8-byte aligned */ +uint32_t getAlignedPingExtSize(uint32_t dataSize) { + + return sizeof(clusterMsgPingExt) + EIGHT_BYTE_ALIGN(dataSize); +} + +uint32_t getHostnamePingExtSize(void) { + if (sdslen(myself->hostname) == 0) { + return 0; + } + return getAlignedPingExtSize(sdslen(myself->hostname) + 1); +} + +uint32_t getHumanNodenamePingExtSize(void) { + if (sdslen(myself->human_nodename) == 0) { + return 0; + } + return getAlignedPingExtSize(sdslen(myself->human_nodename) + 1); +} + +uint32_t getShardIdPingExtSize(void) { + return getAlignedPingExtSize(sizeof(clusterMsgPingExtShardId)); +} + +uint32_t getForgottenNodeExtSize(void) { + return getAlignedPingExtSize(sizeof(clusterMsgPingExtForgottenNode)); +} + +void *preparePingExt(clusterMsgPingExt *ext, uint16_t type, uint32_t length) { + ext->type = htons(type); + ext->length = htonl(length); + return &ext->ext[0]; +} + +clusterMsgPingExt *nextPingExt(clusterMsgPingExt *ext) { + return (clusterMsgPingExt *)((char*)ext + ntohl(ext->length)); +} + +/* 1. 
If a NULL hdr is provided, compute the extension size; + * 2. If a non-NULL hdr is provided, write the hostname ping + * extension at the start of the cursor. This function + * will update the cursor to point to the end of the + * written extension and will return the amount of bytes + * written. */ +uint32_t writePingExt(clusterMsg *hdr, int gossipcount) { + uint16_t extensions = 0; + uint32_t totlen = 0; + clusterMsgPingExt *cursor = NULL; + /* Set the initial extension position */ + if (hdr != NULL) { + cursor = getInitialPingExt(hdr, gossipcount); + } + + /* hostname is optional */ + if (sdslen(myself->hostname) != 0) { + if (cursor != NULL) { + /* Populate hostname */ + clusterMsgPingExtHostname *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_HOSTNAME, getHostnamePingExtSize()); + memcpy(ext->hostname, myself->hostname, sdslen(myself->hostname)); + + /* Move the write cursor */ + cursor = nextPingExt(cursor); + } + + totlen += getHostnamePingExtSize(); + extensions++; + } + + if (sdslen(myself->human_nodename) != 0) { + if (cursor != NULL) { + /* Populate human_nodename */ + clusterMsgPingExtHumanNodename *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_HUMAN_NODENAME, getHumanNodenamePingExtSize()); + memcpy(ext->human_nodename, myself->human_nodename, sdslen(myself->human_nodename)); + + /* Move the write cursor */ + cursor = nextPingExt(cursor); + } + + totlen += getHumanNodenamePingExtSize(); + extensions++; + } + + /* Gossip forgotten nodes */ + if (dictSize(server.cluster->nodes_black_list) > 0) { + dictIterator *di = dictGetIterator(server.cluster->nodes_black_list); + dictEntry *de; + while ((de = dictNext(di)) != NULL) { + if (cursor != NULL) { + uint64_t expire = dictGetUnsignedIntegerVal(de); + if ((time_t)expire < server.unixtime) continue; /* already expired */ + uint64_t ttl = expire - server.unixtime; + clusterMsgPingExtForgottenNode *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_FORGOTTEN_NODE, getForgottenNodeExtSize()); + 
memcpy(ext->name, dictGetKey(de), CLUSTER_NAMELEN); + ext->ttl = htonu64(ttl); + + /* Move the write cursor */ + cursor = nextPingExt(cursor); + } + totlen += getForgottenNodeExtSize(); + extensions++; + } + dictReleaseIterator(di); + } + + /* Populate shard_id */ + if (cursor != NULL) { + clusterMsgPingExtShardId *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_SHARDID, getShardIdPingExtSize()); + memcpy(ext->shard_id, myself->shard_id, CLUSTER_NAMELEN); + + /* Move the write cursor */ + cursor = nextPingExt(cursor); + } + totlen += getShardIdPingExtSize(); + extensions++; + + if (hdr != NULL) { + if (extensions != 0) { + hdr->mflags[0] |= CLUSTERMSG_FLAG0_EXT_DATA; + } + hdr->extensions = htons(extensions); + } + + return totlen; +} + +/* We previously validated the extensions, so this function just needs to + * handle the extensions. */ +void clusterProcessPingExtensions(clusterMsg *hdr, clusterLink *link) { + clusterNode *sender = link->node ? link->node : clusterLookupNode(hdr->sender, CLUSTER_NAMELEN); + char *ext_hostname = NULL; + char *ext_humannodename = NULL; + char *ext_shardid = NULL; + uint16_t extensions = ntohs(hdr->extensions); + /* Loop through all the extensions and process them */ + clusterMsgPingExt *ext = getInitialPingExt(hdr, ntohs(hdr->count)); + while (extensions--) { + uint16_t type = ntohs(ext->type); + if (type == CLUSTERMSG_EXT_TYPE_HOSTNAME) { + clusterMsgPingExtHostname *hostname_ext = (clusterMsgPingExtHostname *) &(ext->ext[0].hostname); + ext_hostname = hostname_ext->hostname; + } else if (type == CLUSTERMSG_EXT_TYPE_HUMAN_NODENAME) { + clusterMsgPingExtHumanNodename *humannodename_ext = (clusterMsgPingExtHumanNodename *) &(ext->ext[0].human_nodename); + ext_humannodename = humannodename_ext->human_nodename; + } else if (type == CLUSTERMSG_EXT_TYPE_FORGOTTEN_NODE) { + clusterMsgPingExtForgottenNode *forgotten_node_ext = &(ext->ext[0].forgotten_node); + clusterNode *n = clusterLookupNode(forgotten_node_ext->name, CLUSTER_NAMELEN); 
+ if (n && n != myself && !(nodeIsSlave(myself) && myself->slaveof == n)) { + sds id = sdsnewlen(forgotten_node_ext->name, CLUSTER_NAMELEN); + dictEntry *de = dictAddOrFind(server.cluster->nodes_black_list, id); + uint64_t expire = server.unixtime + ntohu64(forgotten_node_ext->ttl); + dictSetUnsignedIntegerVal(de, expire); + clusterDelNode(n); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_SAVE_CONFIG); + } + } else if (type == CLUSTERMSG_EXT_TYPE_SHARDID) { + clusterMsgPingExtShardId *shardid_ext = (clusterMsgPingExtShardId *) &(ext->ext[0].shard_id); + ext_shardid = shardid_ext->shard_id; + } else { + /* Unknown type, we will ignore it but log what happened. */ + serverLog(LL_WARNING, "Received unknown extension type %d", type); + } + + /* We know this will be valid since we validated it ahead of time */ + ext = getNextPingExt(ext); + } + /* If the node did not send us a hostname extension, assume + * they don't have an announced hostname. Otherwise, we'll + * set it now. */ + updateAnnouncedHostname(sender, ext_hostname); + updateAnnouncedHumanNodename(sender, ext_humannodename); + updateShardId(sender, ext_shardid); +} + +static clusterNode *getNodeFromLinkAndMsg(clusterLink *link, clusterMsg *hdr) { + clusterNode *sender; + if (link->node && !nodeInHandshake(link->node)) { + /* If the link has an associated node, use that so that we don't have to look it + * up every time, except when the node is still in handshake, the node still has + * a random name thus not truly "known". */ + sender = link->node; + } else { + /* Otherwise, fetch sender based on the message */ + sender = clusterLookupNode(hdr->sender, CLUSTER_NAMELEN); + /* We know the sender node but haven't associate it with the link. This must + * be an inbound link because only for inbound links we didn't know which node + * to associate when they were created. 
*/ + if (sender && !link->node) { + setClusterNodeToInboundClusterLink(sender, link); + } + } + return sender; +} + +/* When this function is called, there is a packet to process starting + * at link->rcvbuf. Releasing the buffer is up to the caller, so this + * function should just handle the higher level stuff of processing the + * packet, modifying the cluster state if needed. + * + * The function returns 1 if the link is still valid after the packet + * was processed, otherwise 0 if the link was freed since the packet + * processing lead to some inconsistency error (for instance a PONG + * received from the wrong sender ID). */ +int clusterProcessPacket(clusterLink *link) { + clusterMsg *hdr = (clusterMsg*) link->rcvbuf; + uint32_t totlen = ntohl(hdr->totlen); + uint16_t type = ntohs(hdr->type); + mstime_t now = mstime(); + + if (type < CLUSTERMSG_TYPE_COUNT) + server.cluster->stats_bus_messages_received[type]++; + serverLog(LL_DEBUG,"--- Processing packet of type %s, %lu bytes", + clusterGetMessageTypeString(type), (unsigned long) totlen); + + /* Perform sanity checks */ + if (totlen < 16) return 1; /* At least signature, version, totlen, count. */ + if (totlen > link->rcvbuf_len) return 1; + + if (ntohs(hdr->ver) != CLUSTER_PROTO_VER) { + /* Can't handle messages of different versions. 
*/ + return 1; + } + + if (type == server.cluster_drop_packet_filter) { + serverLog(LL_WARNING, "Dropping packet that matches debug drop filter"); + return 1; + } + + uint16_t flags = ntohs(hdr->flags); + uint16_t extensions = ntohs(hdr->extensions); + uint64_t senderCurrentEpoch = 0, senderConfigEpoch = 0; + uint32_t explen; /* expected length of this packet */ + clusterNode *sender; + + if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG || + type == CLUSTERMSG_TYPE_MEET) + { + uint16_t count = ntohs(hdr->count); + + explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + explen += (sizeof(clusterMsgDataGossip)*count); + + /* If there is extension data, which doesn't have a fixed length, + * loop through them and validate the length of it now. */ + if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) { + clusterMsgPingExt *ext = getInitialPingExt(hdr, count); + while (extensions--) { + uint16_t extlen = getPingExtLength(ext); + if (extlen % 8 != 0) { + serverLog(LL_WARNING, "Received a %s packet without proper padding (%d bytes)", + clusterGetMessageTypeString(type), (int) extlen); + return 1; + } + if ((totlen - explen) < extlen) { + serverLog(LL_WARNING, "Received invalid %s packet with extension data that exceeds " + "total packet length (%lld)", clusterGetMessageTypeString(type), + (unsigned long long) totlen); + return 1; + } + explen += extlen; + ext = getNextPingExt(ext); + } + } + } else if (type == CLUSTERMSG_TYPE_FAIL) { + explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + explen += sizeof(clusterMsgDataFail); + } else if (type == CLUSTERMSG_TYPE_PUBLISH || type == CLUSTERMSG_TYPE_PUBLISHSHARD) { + explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + explen += sizeof(clusterMsgDataPublish) - + 8 + + ntohl(hdr->data.publish.msg.channel_len) + + ntohl(hdr->data.publish.msg.message_len); + } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST || + type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK || + type == CLUSTERMSG_TYPE_MFSTART) 
+ { + explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + } else if (type == CLUSTERMSG_TYPE_UPDATE) { + explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + explen += sizeof(clusterMsgDataUpdate); + } else if (type == CLUSTERMSG_TYPE_MODULE) { + explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + explen += sizeof(clusterMsgModule) - + 3 + ntohl(hdr->data.module.msg.len); + } else { + /* We don't know this type of packet, so we assume it's well formed. */ + explen = totlen; + } + + if (totlen != explen) { + serverLog(LL_WARNING, "Received invalid %s packet of length %lld but expected length %lld", + clusterGetMessageTypeString(type), (unsigned long long) totlen, (unsigned long long) explen); + return 1; + } + + sender = getNodeFromLinkAndMsg(link, hdr); + + /* Update the last time we saw any data from this node. We + * use this in order to avoid detecting a timeout from a node that + * is just sending a lot of data in the cluster bus, for instance + * because of Pub/Sub. */ + if (sender) sender->data_received = now; + + if (sender && !nodeInHandshake(sender)) { + /* Update our currentEpoch if we see a newer epoch in the cluster. */ + senderCurrentEpoch = ntohu64(hdr->currentEpoch); + senderConfigEpoch = ntohu64(hdr->configEpoch); + if (senderCurrentEpoch > server.cluster->currentEpoch) + server.cluster->currentEpoch = senderCurrentEpoch; + /* Update the sender configEpoch if it is publishing a newer one. */ + if (senderConfigEpoch > sender->configEpoch) { + sender->configEpoch = senderConfigEpoch; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_FSYNC_CONFIG); + } + /* Update the replication offset info for this node. */ + sender->repl_offset = ntohu64(hdr->offset); + sender->repl_offset_time = now; + /* If we are a slave performing a manual failover and our master + * sent its offset while already paused, populate the MF state. 
*/ + if (server.cluster->mf_end && + nodeIsSlave(myself) && + myself->slaveof == sender && + hdr->mflags[0] & CLUSTERMSG_FLAG0_PAUSED && + server.cluster->mf_master_offset == -1) + { + server.cluster->mf_master_offset = sender->repl_offset; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER); + serverLog(LL_NOTICE, + "Received replication offset for paused " + "master manual failover: %lld", + server.cluster->mf_master_offset); + } + } + + /* Initial processing of PING and MEET requests replying with a PONG. */ + if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_MEET) { + /* We use incoming MEET messages in order to set the address + * for 'myself', since only other cluster nodes will send us + * MEET messages on handshakes, when the cluster joins, or + * later if we changed address, and those nodes will use our + * official address to connect to us. So by obtaining this address + * from the socket is a simple way to discover / update our own + * address in the cluster without it being hardcoded in the config. + * + * However if we don't have an address at all, we update the address + * even with a normal PING packet. If it's wrong it will be fixed + * by MEET later. */ + if ((type == CLUSTERMSG_TYPE_MEET || myself->ip[0] == '\0') && + server.cluster_announce_ip == NULL) + { + char ip[NET_IP_STR_LEN]; + + if (connAddrSockName(link->conn,ip,sizeof(ip),NULL) != -1 && + strcmp(ip,myself->ip)) + { + memcpy(myself->ip,ip,NET_IP_STR_LEN); + serverLog(LL_NOTICE,"IP address for this node updated to %s", + myself->ip); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + } + } + + /* Add this node if it is new for us and the msg type is MEET. + * In this stage we don't try to add the node with the right + * flags, slaveof pointer, and so forth, as this details will be + * resolved when we'll receive PONGs from the node. 
*/ + if (!sender && type == CLUSTERMSG_TYPE_MEET) { + clusterNode *node; + + node = createClusterNode(NULL,CLUSTER_NODE_HANDSHAKE); + serverAssert(nodeIp2String(node->ip,link,hdr->myip) == C_OK); + getClientPortFromClusterMsg(hdr, &node->tls_port, &node->tcp_port); + node->cport = ntohs(hdr->cport); + clusterAddNode(node); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + } + + /* If this is a MEET packet from an unknown node, we still process + * the gossip section here since we have to trust the sender because + * of the message type. */ + if (!sender && type == CLUSTERMSG_TYPE_MEET) + clusterProcessGossipSection(hdr,link); + + /* Anyway reply with a PONG */ + clusterSendPing(link,CLUSTERMSG_TYPE_PONG); + } + + /* PING, PONG, MEET: process config information. */ + if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG || + type == CLUSTERMSG_TYPE_MEET) + { + serverLog(LL_DEBUG,"%s packet received: %.40s", + clusterGetMessageTypeString(type), + link->node ? link->node->name : "NULL"); + if (!link->inbound) { + if (nodeInHandshake(link->node)) { + /* If we already have this node, try to change the + * IP/port of the node with the new one. */ + if (sender) { + serverLog(LL_VERBOSE, + "Handshake: we already know node %.40s (%s), " + "updating the address if needed.", sender->name, sender->human_nodename); + if (nodeUpdateAddressIfNeeded(sender,link,hdr)) + { + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); + } + /* Free this node as we already have it. This will + * cause the link to be freed as well. */ + clusterDelNode(link->node); + return 0; + } + + /* First thing to do is replacing the random name with the + * right node name if this was a handshake stage. 
*/ + clusterRenameNode(link->node, hdr->sender); + serverLog(LL_DEBUG,"Handshake with node %.40s completed.", + link->node->name); + link->node->flags &= ~CLUSTER_NODE_HANDSHAKE; + link->node->flags |= flags&(CLUSTER_NODE_MASTER|CLUSTER_NODE_SLAVE); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + } else if (memcmp(link->node->name,hdr->sender, + CLUSTER_NAMELEN) != 0) + { + /* If the reply has a non matching node ID we + * disconnect this node and set it as not having an associated + * address. */ + serverLog(LL_DEBUG,"PONG contains mismatching sender ID. About node %.40s added %d ms ago, having flags %d", + link->node->name, + (int)(now-(link->node->ctime)), + link->node->flags); + link->node->flags |= CLUSTER_NODE_NOADDR; + link->node->ip[0] = '\0'; + link->node->tcp_port = 0; + link->node->tls_port = 0; + link->node->cport = 0; + freeClusterLink(link); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + return 0; + } + } + + /* Copy the CLUSTER_NODE_NOFAILOVER flag from what the sender + * announced. This is a dynamic flag that we receive from the + * sender, and the latest status must be trusted. We need it to + * be propagated because the slave ranking used to understand the + * delay of each slave in the voting process, needs to know + * what are the instances really competing. */ + if (sender) { + int nofailover = flags & CLUSTER_NODE_NOFAILOVER; + sender->flags &= ~CLUSTER_NODE_NOFAILOVER; + sender->flags |= nofailover; + } + + /* Update the node address if it changed. 
*/ + if (sender && type == CLUSTERMSG_TYPE_PING && + !nodeInHandshake(sender) && + nodeUpdateAddressIfNeeded(sender,link,hdr)) + { + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); + } + + /* Update our info about the node */ + if (!link->inbound && type == CLUSTERMSG_TYPE_PONG) { + link->node->pong_received = now; + link->node->ping_sent = 0; + + /* The PFAIL condition can be reversed without external + * help if it is momentary (that is, if it does not + * turn into a FAIL state). + * + * The FAIL condition is also reversible under specific + * conditions detected by clearNodeFailureIfNeeded(). */ + if (nodeTimedOut(link->node)) { + link->node->flags &= ~CLUSTER_NODE_PFAIL; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); + } else if (nodeFailed(link->node)) { + clearNodeFailureIfNeeded(link->node); + } + } + + /* Check for role switch: slave -> master or master -> slave. */ + if (sender) { + if (!memcmp(hdr->slaveof,CLUSTER_NODE_NULL_NAME, + sizeof(hdr->slaveof))) + { + /* Node is a master. */ + clusterSetNodeAsMaster(sender); + } else { + /* Node is a slave. */ + clusterNode *master = clusterLookupNode(hdr->slaveof, CLUSTER_NAMELEN); + + if (clusterNodeIsMaster(sender)) { + /* Master turned into a slave! Reconfigure the node. */ + clusterDelNodeSlots(sender); + sender->flags &= ~(CLUSTER_NODE_MASTER| + CLUSTER_NODE_MIGRATE_TO); + sender->flags |= CLUSTER_NODE_SLAVE; + + /* Update config and state. */ + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); + } + + /* Master node changed for this slave? */ + if (master && sender->slaveof != master) { + if (sender->slaveof) + clusterNodeRemoveSlave(sender->slaveof,sender); + clusterNodeAddSlave(master,sender); + sender->slaveof = master; + + /* Update config. */ + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + } + } + } + + /* Update our info about served slots. 
+ * + * Note: this MUST happen after we update the master/slave state + * so that CLUSTER_NODE_MASTER flag will be set. */ + + /* Many checks are only needed if the set of served slots this + * instance claims is different compared to the set of slots we have + * for it. Check this ASAP to avoid other computational expansive + * checks later. */ + clusterNode *sender_master = NULL; /* Sender or its master if slave. */ + int dirty_slots = 0; /* Sender claimed slots don't match my view? */ + + if (sender) { + sender_master = clusterNodeIsMaster(sender) ? sender : sender->slaveof; + if (sender_master) { + dirty_slots = memcmp(sender_master->slots, + hdr->myslots,sizeof(hdr->myslots)) != 0; + } + } + + /* 1) If the sender of the message is a master, and we detected that + * the set of slots it claims changed, scan the slots to see if we + * need to update our configuration. */ + if (sender && clusterNodeIsMaster(sender) && dirty_slots) + clusterUpdateSlotsConfigWith(sender,senderConfigEpoch,hdr->myslots); + + /* 2) We also check for the reverse condition, that is, the sender + * claims to serve slots we know are served by a master with a + * greater configEpoch. If this happens we inform the sender. + * + * This is useful because sometimes after a partition heals, a + * reappearing master may be the last one to claim a given set of + * hash slots, but with a configuration that other instances know to + * be deprecated. Example: + * + * A and B are master and slave for slots 1,2,3. + * A is partitioned away, B gets promoted. + * B is partitioned away, and A returns available. + * + * Usually B would PING A publishing its set of served slots and its + * configEpoch, but because of the partition B can't inform A of the + * new configuration, so other nodes that have an updated table must + * do it. In this way A will stop to act as a master (or can try to + * failover if there are the conditions to win the election). 
*/ + if (sender && dirty_slots) { + int j; + + for (j = 0; j < CLUSTER_SLOTS; j++) { + if (bitmapTestBit(hdr->myslots,j)) { + if (server.cluster->slots[j] == sender || + isSlotUnclaimed(j)) continue; + if (server.cluster->slots[j]->configEpoch > + senderConfigEpoch) + { + serverLog(LL_VERBOSE, + "Node %.40s has old slots configuration, sending " + "an UPDATE message about %.40s", + sender->name, server.cluster->slots[j]->name); + clusterSendUpdate(sender->link, + server.cluster->slots[j]); + + /* TODO: instead of exiting the loop send every other + * UPDATE packet for other nodes that are the new owner + * of sender's slots. */ + break; + } + } + } + } + + /* If our config epoch collides with the sender's try to fix + * the problem. */ + if (sender && clusterNodeIsMaster(myself) && clusterNodeIsMaster(sender) && + senderConfigEpoch == myself->configEpoch) + { + clusterHandleConfigEpochCollision(sender); + } + + /* Get info from the gossip section */ + if (sender) { + clusterProcessGossipSection(hdr,link); + clusterProcessPingExtensions(hdr,link); + } + } else if (type == CLUSTERMSG_TYPE_FAIL) { + clusterNode *failing; + + if (sender) { + failing = clusterLookupNode(hdr->data.fail.about.nodename, CLUSTER_NAMELEN); + if (failing && + !(failing->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_MYSELF))) + { + serverLog(LL_NOTICE, + "FAIL message received from %.40s (%s) about %.40s (%s)", + hdr->sender, sender->human_nodename, hdr->data.fail.about.nodename, failing->human_nodename); + failing->flags |= CLUSTER_NODE_FAIL; + failing->fail_time = now; + failing->flags &= ~CLUSTER_NODE_PFAIL; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); + } + } else { + serverLog(LL_NOTICE, + "Ignoring FAIL message from unknown node %.40s about %.40s", + hdr->sender, hdr->data.fail.about.nodename); + } + } else if (type == CLUSTERMSG_TYPE_PUBLISH || type == CLUSTERMSG_TYPE_PUBLISHSHARD) { + if (!sender) return 1; /* We don't know that node. 
*/ + + robj *channel, *message; + uint32_t channel_len, message_len; + + /* Don't bother creating useless objects if there are no + * Pub/Sub subscribers. */ + if ((type == CLUSTERMSG_TYPE_PUBLISH + && serverPubsubSubscriptionCount() > 0) + || (type == CLUSTERMSG_TYPE_PUBLISHSHARD + && serverPubsubShardSubscriptionCount() > 0)) + { + channel_len = ntohl(hdr->data.publish.msg.channel_len); + message_len = ntohl(hdr->data.publish.msg.message_len); + channel = createStringObject( + (char*)hdr->data.publish.msg.bulk_data,channel_len); + message = createStringObject( + (char*)hdr->data.publish.msg.bulk_data+channel_len, + message_len); + pubsubPublishMessage(channel, message, type == CLUSTERMSG_TYPE_PUBLISHSHARD); + decrRefCount(channel); + decrRefCount(message); + } + } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST) { + if (!sender) return 1; /* We don't know that node. */ + clusterSendFailoverAuthIfNeeded(sender,hdr); + } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK) { + if (!sender) return 1; /* We don't know that node. */ + /* We consider this vote only if the sender is a master serving + * a non zero number of slots, and its currentEpoch is greater or + * equal to epoch where this node started the election. */ + if (clusterNodeIsMaster(sender) && sender->numslots > 0 && + senderCurrentEpoch >= server.cluster->failover_auth_epoch) + { + server.cluster->failover_auth_count++; + /* Maybe we reached a quorum here, set a flag to make sure + * we check ASAP. */ + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); + } + } else if (type == CLUSTERMSG_TYPE_MFSTART) { + /* This message is acceptable only if I'm a master and the sender + * is one of my slaves. */ + if (!sender || sender->slaveof != myself) return 1; + /* Manual failover requested from slaves. Initialize the state + * accordingly. 
*/ + resetManualFailover(); + server.cluster->mf_end = now + CLUSTER_MF_TIMEOUT; + server.cluster->mf_slave = sender; + pauseActions(PAUSE_DURING_FAILOVER, + now + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT), + PAUSE_ACTIONS_CLIENT_WRITE_SET); + serverLog(LL_NOTICE,"Manual failover requested by replica %.40s (%s).", + sender->name, sender->human_nodename); + /* We need to send a ping message to the replica, as it would carry + * `server.cluster->mf_master_offset`, which means the master paused clients + * at offset `server.cluster->mf_master_offset`, so that the replica would + * know that it is safe to set its `server.cluster->mf_can_start` to 1 so as + * to complete failover as quickly as possible. */ + clusterSendPing(link, CLUSTERMSG_TYPE_PING); + } else if (type == CLUSTERMSG_TYPE_UPDATE) { + clusterNode *n; /* The node the update is about. */ + uint64_t reportedConfigEpoch = + ntohu64(hdr->data.update.nodecfg.configEpoch); + + if (!sender) return 1; /* We don't know the sender. */ + n = clusterLookupNode(hdr->data.update.nodecfg.nodename, CLUSTER_NAMELEN); + if (!n) return 1; /* We don't know the reported node. */ + if (n->configEpoch >= reportedConfigEpoch) return 1; /* Nothing new. */ + + /* If in our current config the node is a slave, set it as a master. */ + if (nodeIsSlave(n)) clusterSetNodeAsMaster(n); + + /* Update the node's configEpoch. */ + n->configEpoch = reportedConfigEpoch; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_FSYNC_CONFIG); + + /* Check the bitmap of served slots and update our + * config accordingly. */ + clusterUpdateSlotsConfigWith(n,reportedConfigEpoch, + hdr->data.update.nodecfg.slots); + } else if (type == CLUSTERMSG_TYPE_MODULE) { + if (!sender) return 1; /* Protect the module from unknown nodes. */ + /* We need to route this message back to the right module subscribed + * for the right message type. 
*/ + uint64_t module_id = hdr->data.module.msg.module_id; /* Endian-safe ID */ + uint32_t len = ntohl(hdr->data.module.msg.len); + uint8_t type = hdr->data.module.msg.type; + unsigned char *payload = hdr->data.module.msg.bulk_data; + moduleCallClusterReceivers(sender->name,module_id,type,payload,len); + } else { + serverLog(LL_WARNING,"Received unknown packet type: %d", type); + } + return 1; +} + +/* This function is called when we detect the link with this node is lost. + We set the node as no longer connected. The Cluster Cron will detect + this connection and will try to get it connected again. + + Instead if the node is a temporary node used to accept a query, we + completely free the node on error. */ +void handleLinkIOError(clusterLink *link) { + freeClusterLink(link); +} + +/* Send the messages queued for the link. */ +void clusterWriteHandler(connection *conn) { + clusterLink *link = connGetPrivateData(conn); + ssize_t nwritten; + size_t totwritten = 0; + + while (totwritten < NET_MAX_WRITES_PER_EVENT && listLength(link->send_msg_queue) > 0) { + listNode *head = listFirst(link->send_msg_queue); + clusterMsgSendBlock *msgblock = (clusterMsgSendBlock*)head->value; + clusterMsg *msg = &msgblock->msg; + size_t msg_offset = link->head_msg_send_offset; + size_t msg_len = ntohl(msg->totlen); + + nwritten = connWrite(conn, (char*)msg + msg_offset, msg_len - msg_offset); + if (nwritten <= 0) { + serverLog(LL_DEBUG,"I/O error writing to node link: %s", + (nwritten == -1) ? 
connGetLastError(conn) : "short write"); + handleLinkIOError(link); + return; + } + if (msg_offset + nwritten < msg_len) { + /* If full message wasn't written, record the offset + * and continue sending from this point next time */ + link->head_msg_send_offset += nwritten; + return; + } + serverAssert((msg_offset + nwritten) == msg_len); + link->head_msg_send_offset = 0; + + /* Delete the node and update our memory tracking */ + uint32_t blocklen = msgblock->totlen; + listDelNode(link->send_msg_queue, head); + server.stat_cluster_links_memory -= sizeof(listNode); + link->send_msg_queue_mem -= sizeof(listNode) + blocklen; + + totwritten += nwritten; + } + + if (listLength(link->send_msg_queue) == 0) + connSetWriteHandler(link->conn, NULL); +} + +/* A connect handler that gets called when a connection to another node + * gets established. + */ +void clusterLinkConnectHandler(connection *conn) { + clusterLink *link = connGetPrivateData(conn); + clusterNode *node = link->node; + + /* Check if connection succeeded */ + if (connGetState(conn) != CONN_STATE_CONNECTED) { + serverLog(LL_VERBOSE, "Connection with Node %.40s at %s:%d failed: %s", + node->name, node->ip, node->cport, + connGetLastError(conn)); + freeClusterLink(link); + return; + } + + /* Register a read handler from now on */ + connSetReadHandler(conn, clusterReadHandler); + + /* Queue a PING in the new connection ASAP: this is crucial + * to avoid false positives in failure detection. + * + * If the node is flagged as MEET, we send a MEET message instead + * of a PING one, to force the receiver to add us in its node + * table. */ + mstime_t old_ping_sent = node->ping_sent; + clusterSendPing(link, node->flags & CLUSTER_NODE_MEET ? + CLUSTERMSG_TYPE_MEET : CLUSTERMSG_TYPE_PING); + if (old_ping_sent) { + /* If there was an active ping before the link was + * disconnected, we want to restore the ping time, otherwise + * replaced by the clusterSendPing() call. 
*/ + node->ping_sent = old_ping_sent; + } + /* We can clear the flag after the first packet is sent. + * If we'll never receive a PONG, we'll never send new packets + * to this node. Instead after the PONG is received and we + * are no longer in meet/handshake status, we want to send + * normal PING packets. */ + node->flags &= ~CLUSTER_NODE_MEET; + + serverLog(LL_DEBUG,"Connecting with Node %.40s at %s:%d", + node->name, node->ip, node->cport); +} + +/* Read data. Try to read the first field of the header first to check the + * full length of the packet. When a whole packet is in memory this function + * will call the function to process the packet. And so forth. */ +void clusterReadHandler(connection *conn) { + clusterMsg buf[1]; + ssize_t nread; + clusterMsg *hdr; + clusterLink *link = connGetPrivateData(conn); + unsigned int readlen, rcvbuflen; + + while(1) { /* Read as long as there is data to read. */ + rcvbuflen = link->rcvbuf_len; + if (rcvbuflen < 8) { + /* First, obtain the first 8 bytes to get the full message + * length. */ + readlen = 8 - rcvbuflen; + } else { + /* Finally read the full message. */ + hdr = (clusterMsg*) link->rcvbuf; + if (rcvbuflen == 8) { + /* Perform some sanity check on the message signature + * and length. */ + if (memcmp(hdr->sig,"RCmb",4) != 0 || + ntohl(hdr->totlen) < CLUSTERMSG_MIN_LEN) + { + char ip[NET_IP_STR_LEN]; + int port; + if (connAddrPeerName(conn, ip, sizeof(ip), &port) == -1) { + serverLog(LL_WARNING, + "Bad message length or signature received " + "on the Cluster bus."); + } else { + serverLog(LL_WARNING, + "Bad message length or signature received " + "on the Cluster bus from %s:%d", ip, port); + } + handleLinkIOError(link); + return; + } + } + readlen = ntohl(hdr->totlen) - rcvbuflen; + if (readlen > sizeof(buf)) readlen = sizeof(buf); + } + + nread = connRead(conn,buf,readlen); + if (nread == -1 && (connGetState(conn) == CONN_STATE_CONNECTED)) return; /* No more data ready. 
*/ + + if (nread <= 0) { + /* I/O error... */ + serverLog(LL_DEBUG,"I/O error reading from node link: %s", + (nread == 0) ? "connection closed" : connGetLastError(conn)); + handleLinkIOError(link); + return; + } else { + /* Read data and recast the pointer to the new buffer. */ + size_t unused = link->rcvbuf_alloc - link->rcvbuf_len; + if ((size_t)nread > unused) { + size_t required = link->rcvbuf_len + nread; + size_t prev_rcvbuf_alloc = link->rcvbuf_alloc; + /* If less than 1mb, grow to twice the needed size, if larger grow by 1mb. */ + link->rcvbuf_alloc = required < RCVBUF_MAX_PREALLOC ? required * 2: required + RCVBUF_MAX_PREALLOC; + link->rcvbuf = zrealloc(link->rcvbuf, link->rcvbuf_alloc); + server.stat_cluster_links_memory += link->rcvbuf_alloc - prev_rcvbuf_alloc; + } + memcpy(link->rcvbuf + link->rcvbuf_len, buf, nread); + link->rcvbuf_len += nread; + hdr = (clusterMsg*) link->rcvbuf; + rcvbuflen += nread; + } + + /* Total length obtained? Process this packet. */ + if (rcvbuflen >= 8 && rcvbuflen == ntohl(hdr->totlen)) { + if (clusterProcessPacket(link)) { + if (link->rcvbuf_alloc > RCVBUF_INIT_LEN) { + size_t prev_rcvbuf_alloc = link->rcvbuf_alloc; + zfree(link->rcvbuf); + link->rcvbuf = zmalloc(link->rcvbuf_alloc = RCVBUF_INIT_LEN); + server.stat_cluster_links_memory += link->rcvbuf_alloc - prev_rcvbuf_alloc; + } + link->rcvbuf_len = 0; + } else { + return; /* Link no longer valid. */ + } + } + } +} + +/* Put the message block into the link's send queue. + * + * It is guaranteed that this function will never have as a side effect + * the link to be invalidated, so it is safe to call this function + * from event handlers that will do stuff with the same link later. 
*/ +void clusterSendMessage(clusterLink *link, clusterMsgSendBlock *msgblock) { + if (!link) { + return; + } + if (listLength(link->send_msg_queue) == 0 && msgblock->msg.totlen != 0) + connSetWriteHandlerWithBarrier(link->conn, clusterWriteHandler, 1); + + listAddNodeTail(link->send_msg_queue, msgblock); + msgblock->refcount++; + + /* Update memory tracking */ + link->send_msg_queue_mem += sizeof(listNode) + msgblock->totlen; + server.stat_cluster_links_memory += sizeof(listNode); + + /* Populate sent messages stats. */ + uint16_t type = ntohs(msgblock->msg.type); + if (type < CLUSTERMSG_TYPE_COUNT) + server.cluster->stats_bus_messages_sent[type]++; +} + +/* Send a message to all the nodes that are part of the cluster having + * a connected link. + * + * It is guaranteed that this function will never have as a side effect + * some node->link to be invalidated, so it is safe to call this function + * from event handlers that will do stuff with node links later. */ +void clusterBroadcastMessage(clusterMsgSendBlock *msgblock) { + dictIterator *di; + dictEntry *de; + + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + if (node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE)) + continue; + clusterSendMessage(node->link,msgblock); + } + dictReleaseIterator(di); +} + +/* Build the message header. hdr must point to a buffer at least + * sizeof(clusterMsg) in bytes. */ +static void clusterBuildMessageHdr(clusterMsg *hdr, int type, size_t msglen) { + uint64_t offset; + clusterNode *master; + + /* If this node is a master, we send its slots bitmap and configEpoch. + * If this node is a slave we send the master's information instead (the + * node is flagged as slave so the receiver knows that it is NOT really + * in charge for this slots. */ + master = (nodeIsSlave(myself) && myself->slaveof) ? 
+ myself->slaveof : myself; + + hdr->ver = htons(CLUSTER_PROTO_VER); + hdr->sig[0] = 'R'; + hdr->sig[1] = 'C'; + hdr->sig[2] = 'm'; + hdr->sig[3] = 'b'; + hdr->type = htons(type); + memcpy(hdr->sender,myself->name,CLUSTER_NAMELEN); + + /* If cluster-announce-ip option is enabled, force the receivers of our + * packets to use the specified address for this node. Otherwise if the + * first byte is zero, they'll do auto discovery. */ + memset(hdr->myip,0,NET_IP_STR_LEN); + if (server.cluster_announce_ip) { + redis_strlcpy(hdr->myip,server.cluster_announce_ip,NET_IP_STR_LEN); + } + + /* Handle cluster-announce-[tls-|bus-]port. */ + int announced_tcp_port, announced_tls_port, announced_cport; + deriveAnnouncedPorts(&announced_tcp_port, &announced_tls_port, &announced_cport); + + memcpy(hdr->myslots,master->slots,sizeof(hdr->myslots)); + memset(hdr->slaveof,0,CLUSTER_NAMELEN); + if (myself->slaveof != NULL) + memcpy(hdr->slaveof,myself->slaveof->name, CLUSTER_NAMELEN); + if (server.tls_cluster) { + hdr->port = htons(announced_tls_port); + hdr->pport = htons(announced_tcp_port); + } else { + hdr->port = htons(announced_tcp_port); + hdr->pport = htons(announced_tls_port); + } + hdr->cport = htons(announced_cport); + hdr->flags = htons(myself->flags); + hdr->state = server.cluster->state; + + /* Set the currentEpoch and configEpochs. */ + hdr->currentEpoch = htonu64(server.cluster->currentEpoch); + hdr->configEpoch = htonu64(master->configEpoch); + + /* Set the replication offset. */ + if (nodeIsSlave(myself)) + offset = replicationGetSlaveOffset(); + else + offset = server.master_repl_offset; + hdr->offset = htonu64(offset); + + /* Set the message flags. */ + if (clusterNodeIsMaster(myself) && server.cluster->mf_end) + hdr->mflags[0] |= CLUSTERMSG_FLAG0_PAUSED; + + hdr->totlen = htonl(msglen); +} + +/* Set the i-th entry of the gossip section in the message pointed by 'hdr' + * to the info of the specified node 'n'. 
*/ +void clusterSetGossipEntry(clusterMsg *hdr, int i, clusterNode *n) { + clusterMsgDataGossip *gossip; + gossip = &(hdr->data.ping.gossip[i]); + memcpy(gossip->nodename,n->name,CLUSTER_NAMELEN); + gossip->ping_sent = htonl(n->ping_sent/1000); + gossip->pong_received = htonl(n->pong_received/1000); + memcpy(gossip->ip,n->ip,sizeof(n->ip)); + if (server.tls_cluster) { + gossip->port = htons(n->tls_port); + gossip->pport = htons(n->tcp_port); + } else { + gossip->port = htons(n->tcp_port); + gossip->pport = htons(n->tls_port); + } + gossip->cport = htons(n->cport); + gossip->flags = htons(n->flags); + gossip->notused1 = 0; +} + +/* Send a PING or PONG packet to the specified node, making sure to add enough + * gossip information. */ +void clusterSendPing(clusterLink *link, int type) { + static unsigned long long cluster_pings_sent = 0; + cluster_pings_sent++; + int gossipcount = 0; /* Number of gossip sections added so far. */ + int wanted; /* Number of gossip sections we want to append if possible. */ + int estlen; /* Upper bound on estimated packet length */ + /* freshnodes is the max number of nodes we can hope to append at all: + * nodes available minus two (ourself and the node we are sending the + * message to). However practically there may be less valid nodes since + * nodes in handshake state, disconnected, are not considered. */ + int freshnodes = dictSize(server.cluster->nodes)-2; + + /* How many gossip sections we want to add? 1/10 of the number of nodes + * and anyway at least 3. Why 1/10? + * + * If we have N masters, with N/10 entries, and we consider that in + * node_timeout we exchange with each other node at least 4 packets + * (we ping in the worst case in node_timeout/2 time, and we also + * receive two pings from the host), we have a total of 8 packets + * in the node_timeout*2 failure reports validity time. 
So we have + * that, for a single PFAIL node, we can expect to receive the following + * number of failure reports (in the specified window of time): + * + * PROB * GOSSIP_ENTRIES_PER_PACKET * TOTAL_PACKETS: + * + * PROB = probability of being featured in a single gossip entry, + * which is 1 / NUM_OF_NODES. + * ENTRIES = 10. + * TOTAL_PACKETS = 2 * 4 * NUM_OF_MASTERS. + * + * If we assume we have just masters (so num of nodes and num of masters + * is the same), with 1/10 we always get over the majority, and specifically + * 80% of the number of nodes, to account for many masters failing at the + * same time. + * + * Since we have non-voting slaves that lower the probability of an entry + * to feature our node, we set the number of entries per packet as + * 10% of the total nodes we have. */ + wanted = floor(dictSize(server.cluster->nodes)/10); + if (wanted < 3) wanted = 3; + if (wanted > freshnodes) wanted = freshnodes; + + /* Include all the nodes in PFAIL state, so that failure reports are + * faster to propagate to go from PFAIL to FAIL state. */ + int pfail_wanted = server.cluster->stats_pfail_nodes; + + /* Compute the maximum estlen to allocate our buffer. We'll fix the estlen + * later according to the number of gossip sections we really were able + * to put inside the packet. */ + estlen = sizeof(clusterMsg) - sizeof(union clusterMsgData); + estlen += (sizeof(clusterMsgDataGossip)*(wanted + pfail_wanted)); + estlen += writePingExt(NULL, 0); + /* Note: clusterBuildMessageHdr() expects the buffer to be always at least + * sizeof(clusterMsg) or more. 
*/ + if (estlen < (int)sizeof(clusterMsg)) estlen = sizeof(clusterMsg); + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, estlen); + clusterMsg *hdr = &msgblock->msg; + + if (!link->inbound && type == CLUSTERMSG_TYPE_PING) + link->node->ping_sent = mstime(); + + /* Populate the gossip fields */ + int maxiterations = wanted*3; + while(freshnodes > 0 && gossipcount < wanted && maxiterations--) { + dictEntry *de = dictGetRandomKey(server.cluster->nodes); + clusterNode *this = dictGetVal(de); + + /* Don't include this node: the whole packet header is about us + * already, so we just gossip about other nodes. */ + if (this == myself) continue; + + /* PFAIL nodes will be added later. */ + if (this->flags & CLUSTER_NODE_PFAIL) continue; + + /* In the gossip section don't include: + * 1) Nodes in HANDSHAKE state. + * 3) Nodes with the NOADDR flag set. + * 4) Disconnected nodes if they don't have configured slots. + */ + if (this->flags & (CLUSTER_NODE_HANDSHAKE|CLUSTER_NODE_NOADDR) || + (this->link == NULL && this->numslots == 0)) + { + freshnodes--; /* Technically not correct, but saves CPU. */ + continue; + } + + /* Do not add a node we already have. */ + if (this->last_in_ping_gossip == cluster_pings_sent) continue; + + /* Add it */ + clusterSetGossipEntry(hdr,gossipcount,this); + this->last_in_ping_gossip = cluster_pings_sent; + freshnodes--; + gossipcount++; + } + + /* If there are PFAIL nodes, add them at the end. 
*/ + if (pfail_wanted) { + dictIterator *di; + dictEntry *de; + + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL && pfail_wanted > 0) { + clusterNode *node = dictGetVal(de); + if (node->flags & CLUSTER_NODE_HANDSHAKE) continue; + if (node->flags & CLUSTER_NODE_NOADDR) continue; + if (!(node->flags & CLUSTER_NODE_PFAIL)) continue; + clusterSetGossipEntry(hdr,gossipcount,node); + gossipcount++; + /* We take the count of the slots we allocated, since the + * PFAIL stats may not match perfectly with the current number + * of PFAIL nodes. */ + pfail_wanted--; + } + dictReleaseIterator(di); + } + + /* Compute the actual total length and send! */ + uint32_t totlen = 0; + totlen += writePingExt(hdr, gossipcount); + totlen += sizeof(clusterMsg)-sizeof(union clusterMsgData); + totlen += (sizeof(clusterMsgDataGossip)*gossipcount); + serverAssert(gossipcount < USHRT_MAX); + hdr->count = htons(gossipcount); + hdr->totlen = htonl(totlen); + + clusterSendMessage(link,msgblock); + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* Send a PONG packet to every connected node that's not in handshake state + * and for which we have a valid link. + * + * In Redis Cluster pongs are not used just for failure detection, but also + * to carry important configuration information. So broadcasting a pong is + * useful when something changes in the configuration and we want to make + * the cluster aware ASAP (for instance after a slave promotion). + * + * The 'target' argument specifies the receiving instances using the + * defines below: + * + * CLUSTER_BROADCAST_ALL -> All known instances. + * CLUSTER_BROADCAST_LOCAL_SLAVES -> All slaves in my master-slaves ring. 
+ */ +#define CLUSTER_BROADCAST_ALL 0 +#define CLUSTER_BROADCAST_LOCAL_SLAVES 1 +void clusterBroadcastPong(int target) { + dictIterator *di; + dictEntry *de; + + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + if (!node->link) continue; + if (node == myself || nodeInHandshake(node)) continue; + if (target == CLUSTER_BROADCAST_LOCAL_SLAVES) { + int local_slave = + nodeIsSlave(node) && node->slaveof && + (node->slaveof == myself || node->slaveof == myself->slaveof); + if (!local_slave) continue; + } + clusterSendPing(node->link,CLUSTERMSG_TYPE_PONG); + } + dictReleaseIterator(di); +} + +/* Create a PUBLISH message block. + * + * Sanitizer suppression: In clusterMsgDataPublish, sizeof(bulk_data) is 8. + * As all the struct is used as a buffer, when more than 8 bytes are copied into + * the 'bulk_data', sanitizer generates an out-of-bounds error which is a false + * positive in this context. */ +REDIS_NO_SANITIZE("bounds") +clusterMsgSendBlock *clusterCreatePublishMsgBlock(robj *channel, robj *message, uint16_t type) { + + uint32_t channel_len, message_len; + + channel = getDecodedObject(channel); + message = getDecodedObject(message); + channel_len = sdslen(channel->ptr); + message_len = sdslen(message->ptr); + + size_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + msglen += sizeof(clusterMsgDataPublish) - 8 + channel_len + message_len; + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, msglen); + + clusterMsg *hdr = &msgblock->msg; + hdr->data.publish.msg.channel_len = htonl(channel_len); + hdr->data.publish.msg.message_len = htonl(message_len); + memcpy(hdr->data.publish.msg.bulk_data,channel->ptr,sdslen(channel->ptr)); + memcpy(hdr->data.publish.msg.bulk_data+sdslen(channel->ptr), + message->ptr,sdslen(message->ptr)); + + decrRefCount(channel); + decrRefCount(message); + + return msgblock; +} + +/* Send a FAIL message to all the nodes we are able to 
contact. + * The FAIL message is sent when we detect that a node is failing + * (CLUSTER_NODE_PFAIL) and we also receive a gossip confirmation of this: + * we switch the node state to CLUSTER_NODE_FAIL and ask all the other + * nodes to do the same ASAP. */ +void clusterSendFail(char *nodename) { + uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + + sizeof(clusterMsgDataFail); + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAIL, msglen); + + clusterMsg *hdr = &msgblock->msg; + memcpy(hdr->data.fail.about.nodename,nodename,CLUSTER_NAMELEN); + + clusterBroadcastMessage(msgblock); + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* Send an UPDATE message to the specified link carrying the specified 'node' + * slots configuration. The node name, slots bitmap, and configEpoch info + * are included. */ +void clusterSendUpdate(clusterLink *link, clusterNode *node) { + if (link == NULL) return; + + uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + + sizeof(clusterMsgDataUpdate); + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_UPDATE, msglen); + + clusterMsg *hdr = &msgblock->msg; + memcpy(hdr->data.update.nodecfg.nodename,node->name,CLUSTER_NAMELEN); + hdr->data.update.nodecfg.configEpoch = htonu64(node->configEpoch); + memcpy(hdr->data.update.nodecfg.slots,node->slots,sizeof(node->slots)); + for (unsigned int i = 0; i < sizeof(node->slots); i++) { + /* Don't advertise slots that the node stopped claiming */ + hdr->data.update.nodecfg.slots[i] = hdr->data.update.nodecfg.slots[i] & (~server.cluster->owner_not_claiming_slot[i]); + } + + clusterSendMessage(link,msgblock); + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* Send a MODULE message. + * + * If link is NULL, then the message is broadcasted to the whole cluster. 
*/ +void clusterSendModule(clusterLink *link, uint64_t module_id, uint8_t type, + const char *payload, uint32_t len) { + uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + msglen += sizeof(clusterMsgModule) - 3 + len; + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MODULE, msglen); + + clusterMsg *hdr = &msgblock->msg; + hdr->data.module.msg.module_id = module_id; /* Already endian adjusted. */ + hdr->data.module.msg.type = type; + hdr->data.module.msg.len = htonl(len); + memcpy(hdr->data.module.msg.bulk_data,payload,len); + + if (link) + clusterSendMessage(link,msgblock); + else + clusterBroadcastMessage(msgblock); + + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* This function gets a cluster node ID string as target, the same way the nodes + * addresses are represented in the modules side, resolves the node, and sends + * the message. If the target is NULL the message is broadcasted. + * + * The function returns C_OK if the target is valid, otherwise C_ERR is + * returned. */ +int clusterSendModuleMessageToTarget(const char *target, uint64_t module_id, uint8_t type, const char *payload, uint32_t len) { + clusterNode *node = NULL; + + if (target != NULL) { + node = clusterLookupNode(target, strlen(target)); + if (node == NULL || node->link == NULL) return C_ERR; + } + + clusterSendModule(target ? node->link : NULL, + module_id, type, payload, len); + return C_OK; +} + +/* ----------------------------------------------------------------------------- + * CLUSTER Pub/Sub support + * + * If `sharded` is 0: + * For now we do very little, just propagating [S]PUBLISH messages across the whole + * cluster. In the future we'll try to get smarter and avoiding propagating those + * messages to hosts without receives for a given channel. + * Otherwise: + * Publish this message across the slot (primary/replica). 
+ * -------------------------------------------------------------------------- */ +void clusterPropagatePublish(robj *channel, robj *message, int sharded) { + clusterMsgSendBlock *msgblock; + + if (!sharded) { + msgblock = clusterCreatePublishMsgBlock(channel, message, CLUSTERMSG_TYPE_PUBLISH); + clusterBroadcastMessage(msgblock); + clusterMsgSendBlockDecrRefCount(msgblock); + return; + } + + listIter li; + listNode *ln; + list *nodes_for_slot = clusterGetNodesInMyShard(server.cluster->myself); + serverAssert(nodes_for_slot != NULL); + listRewind(nodes_for_slot, &li); + msgblock = clusterCreatePublishMsgBlock(channel, message, CLUSTERMSG_TYPE_PUBLISHSHARD); + while((ln = listNext(&li))) { + clusterNode *node = listNodeValue(ln); + if (node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE)) + continue; + clusterSendMessage(node->link,msgblock); + } + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* ----------------------------------------------------------------------------- + * SLAVE node specific functions + * -------------------------------------------------------------------------- */ + +/* This function sends a FAILOVER_AUTH_REQUEST message to every node in order to + * see if there is the quorum for this slave instance to failover its failing + * master. + * + * Note that we send the failover request to everybody, master and slave nodes, + * but only the masters are supposed to reply to our query. */ +void clusterRequestFailoverAuth(void) { + uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST, msglen); + + clusterMsg *hdr = &msgblock->msg; + /* If this is a manual failover, set the CLUSTERMSG_FLAG0_FORCEACK bit + * in the header to communicate the nodes receiving the message that + * they should authorized the failover even if the master is working. 
*/ + if (server.cluster->mf_end) hdr->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK; + clusterBroadcastMessage(msgblock); + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* Send a FAILOVER_AUTH_ACK message to the specified node. */ +void clusterSendFailoverAuth(clusterNode *node) { + if (!node->link) return; + + uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK, msglen); + + clusterSendMessage(node->link,msgblock); + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* Send a MFSTART message to the specified node. */ +void clusterSendMFStart(clusterNode *node) { + if (!node->link) return; + + uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MFSTART, msglen); + + clusterSendMessage(node->link,msgblock); + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* Vote for the node asking for our vote if there are the conditions. */ +void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { + clusterNode *master = node->slaveof; + uint64_t requestCurrentEpoch = ntohu64(request->currentEpoch); + uint64_t requestConfigEpoch = ntohu64(request->configEpoch); + unsigned char *claimed_slots = request->myslots; + int force_ack = request->mflags[0] & CLUSTERMSG_FLAG0_FORCEACK; + int j; + + /* IF we are not a master serving at least 1 slot, we don't have the + * right to vote, as the cluster size in Redis Cluster is the number + * of masters serving at least one slot, and quorum is the cluster + * size + 1 */ + if (nodeIsSlave(myself) || myself->numslots == 0) return; + + /* Request epoch must be >= our currentEpoch. + * Note that it is impossible for it to actually be greater since + * our currentEpoch was updated as a side effect of receiving this + * request, if the request epoch was greater. 
*/ + if (requestCurrentEpoch < server.cluster->currentEpoch) { + serverLog(LL_WARNING, + "Failover auth denied to %.40s (%s): reqEpoch (%llu) < curEpoch(%llu)", + node->name, node->human_nodename, + (unsigned long long) requestCurrentEpoch, + (unsigned long long) server.cluster->currentEpoch); + return; + } + + /* I already voted for this epoch? Return ASAP. */ + if (server.cluster->lastVoteEpoch == server.cluster->currentEpoch) { + serverLog(LL_WARNING, + "Failover auth denied to %.40s (%s): already voted for epoch %llu", + node->name, node->human_nodename, + (unsigned long long) server.cluster->currentEpoch); + return; + } + + /* Node must be a slave and its master down. + * The master can be non failing if the request is flagged + * with CLUSTERMSG_FLAG0_FORCEACK (manual failover). */ + if (clusterNodeIsMaster(node) || master == NULL || + (!nodeFailed(master) && !force_ack)) + { + if (clusterNodeIsMaster(node)) { + serverLog(LL_WARNING, + "Failover auth denied to %.40s (%s): it is a master node", + node->name, node->human_nodename); + } else if (master == NULL) { + serverLog(LL_WARNING, + "Failover auth denied to %.40s (%s): I don't know its master", + node->name, node->human_nodename); + } else if (!nodeFailed(master)) { + serverLog(LL_WARNING, + "Failover auth denied to %.40s (%s): its master is up", + node->name, node->human_nodename); + } + return; + } + + /* We did not voted for a slave about this master for two + * times the node timeout. This is not strictly needed for correctness + * of the algorithm but makes the base case more linear. 
*/ + if (mstime() - node->slaveof->voted_time < server.cluster_node_timeout * 2) + { + serverLog(LL_WARNING, + "Failover auth denied to %.40s %s: " + "can't vote about this master before %lld milliseconds", + node->name, node->human_nodename, + (long long) ((server.cluster_node_timeout*2)- + (mstime() - node->slaveof->voted_time))); + return; + } + + /* The slave requesting the vote must have a configEpoch for the claimed + * slots that is >= the one of the masters currently serving the same + * slots in the current configuration. */ + for (j = 0; j < CLUSTER_SLOTS; j++) { + if (bitmapTestBit(claimed_slots, j) == 0) continue; + if (isSlotUnclaimed(j) || + server.cluster->slots[j]->configEpoch <= requestConfigEpoch) + { + continue; + } + /* If we reached this point we found a slot that in our current slots + * is served by a master with a greater configEpoch than the one claimed + * by the slave requesting our vote. Refuse to vote for this slave. */ + serverLog(LL_WARNING, + "Failover auth denied to %.40s (%s): " + "slot %d epoch (%llu) > reqEpoch (%llu)", + node->name, node->human_nodename, j, + (unsigned long long) server.cluster->slots[j]->configEpoch, + (unsigned long long) requestConfigEpoch); + return; + } + + /* We can vote for this slave. */ + server.cluster->lastVoteEpoch = server.cluster->currentEpoch; + node->slaveof->voted_time = mstime(); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_FSYNC_CONFIG); + clusterSendFailoverAuth(node); + serverLog(LL_NOTICE, "Failover auth granted to %.40s (%s) for epoch %llu", + node->name, node->human_nodename, (unsigned long long) server.cluster->currentEpoch); +} + +/* This function returns the "rank" of this instance, a slave, in the context + * of its master-slaves ring. The rank of the slave is given by the number of + * other slaves for the same master that have a better replication offset + * compared to the local one (better means, greater, so they claim more data). 
+ * + * A slave with rank 0 is the one with the greatest (most up to date) + * replication offset, and so forth. Note that because how the rank is computed + * multiple slaves may have the same rank, in case they have the same offset. + * + * The slave rank is used to add a delay to start an election in order to + * get voted and replace a failing master. Slaves with better replication + * offsets are more likely to win. */ +int clusterGetSlaveRank(void) { + long long myoffset; + int j, rank = 0; + clusterNode *master; + + serverAssert(nodeIsSlave(myself)); + master = myself->slaveof; + if (master == NULL) return 0; /* Never called by slaves without master. */ + + myoffset = replicationGetSlaveOffset(); + for (j = 0; j < master->numslaves; j++) + if (master->slaves[j] != myself && + !nodeCantFailover(master->slaves[j]) && + master->slaves[j]->repl_offset > myoffset) rank++; + return rank; +} + +/* This function is called by clusterHandleSlaveFailover() in order to + * let the slave log why it is not able to failover. Sometimes there are + * not the conditions, but since the failover function is called again and + * again, we can't log the same things continuously. + * + * This function works by logging only if a given set of conditions are + * true: + * + * 1) The reason for which the failover can't be initiated changed. + * The reasons also include a NONE reason we reset the state to + * when the slave finds that its master is fine (no FAIL flag). + * 2) Also, the log is emitted again if the master is still down and + * the reason for not failing over is still the same, but more than + * CLUSTER_CANT_FAILOVER_RELOG_PERIOD seconds elapsed. + * 3) Finally, the function only logs if the slave is down for more than + * five seconds + NODE_TIMEOUT. This way nothing is logged when a + * failover starts in a reasonable time. + * + * The function is called with the reason why the slave can't failover + * which is one of the integer macros CLUSTER_CANT_FAILOVER_*. 
+ * + * The function is guaranteed to be called only if 'myself' is a slave. */ +void clusterLogCantFailover(int reason) { + char *msg; + static time_t lastlog_time = 0; + mstime_t nolog_fail_time = server.cluster_node_timeout + 5000; + + /* Don't log if we have the same reason for some time. */ + if (reason == server.cluster->cant_failover_reason && + time(NULL)-lastlog_time < CLUSTER_CANT_FAILOVER_RELOG_PERIOD) + return; + + server.cluster->cant_failover_reason = reason; + + /* We also don't emit any log if the master failed no long ago, the + * goal of this function is to log slaves in a stalled condition for + * a long time. */ + if (myself->slaveof && + nodeFailed(myself->slaveof) && + (mstime() - myself->slaveof->fail_time) < nolog_fail_time) return; + + switch(reason) { + case CLUSTER_CANT_FAILOVER_DATA_AGE: + msg = "Disconnected from master for longer than allowed. " + "Please check the 'cluster-replica-validity-factor' configuration " + "option."; + break; + case CLUSTER_CANT_FAILOVER_WAITING_DELAY: + msg = "Waiting the delay before I can start a new failover."; + break; + case CLUSTER_CANT_FAILOVER_EXPIRED: + msg = "Failover attempt expired."; + break; + case CLUSTER_CANT_FAILOVER_WAITING_VOTES: + msg = "Waiting for votes, but majority still not reached."; + break; + default: + msg = "Unknown reason code."; + break; + } + lastlog_time = time(NULL); + serverLog(LL_NOTICE,"Currently unable to failover: %s", msg); + + int cur_vote = server.cluster->failover_auth_count; + int cur_quorum = (server.cluster->size / 2) + 1; + /* Emits a log when an election is in progress and waiting for votes or when the failover attempt expired. */ + if (reason == CLUSTER_CANT_FAILOVER_WAITING_VOTES || reason == CLUSTER_CANT_FAILOVER_EXPIRED) { + serverLog(LL_NOTICE, "Needed quorum: %d. 
Number of votes received so far: %d", cur_quorum, cur_vote); + } +} + +/* This function implements the final part of automatic and manual failovers, + * where the slave grabs its master's hash slots, and propagates the new + * configuration. + * + * Note that it's up to the caller to be sure that the node got a new + * configuration epoch already. */ +void clusterFailoverReplaceYourMaster(void) { + int j; + clusterNode *oldmaster = myself->slaveof; + + if (clusterNodeIsMaster(myself) || oldmaster == NULL) return; + + /* 1) Turn this node into a master. */ + clusterSetNodeAsMaster(myself); + replicationUnsetMaster(); + + /* 2) Claim all the slots assigned to our master. */ + for (j = 0; j < CLUSTER_SLOTS; j++) { + if (clusterNodeCoversSlot(oldmaster, j)) { + clusterDelSlot(j); + clusterAddSlot(myself,j); + } + } + + /* 3) Update state and save config. */ + clusterUpdateState(); + clusterSaveConfigOrDie(1); + + /* 4) Pong all the other nodes so that they can update the state + * accordingly and detect that we switched to master role. */ + clusterBroadcastPong(CLUSTER_BROADCAST_ALL); + + /* 5) If there was a manual failover in progress, clear the state. */ + resetManualFailover(); +} + +/* This function is called if we are a slave node and our master serving + * a non-zero amount of hash slots is in FAIL state. + * + * The goal of this function is: + * 1) To check if we are able to perform a failover, is our data updated? + * 2) Try to get elected by masters. + * 3) Perform the failover informing all the other nodes. 
+ */ +void clusterHandleSlaveFailover(void) { + mstime_t data_age; + mstime_t auth_age = mstime() - server.cluster->failover_auth_time; + int needed_quorum = (server.cluster->size / 2) + 1; + int manual_failover = server.cluster->mf_end != 0 && + server.cluster->mf_can_start; + mstime_t auth_timeout, auth_retry_time; + + server.cluster->todo_before_sleep &= ~CLUSTER_TODO_HANDLE_FAILOVER; + + /* Compute the failover timeout (the max time we have to send votes + * and wait for replies), and the failover retry time (the time to wait + * before trying to get voted again). + * + * Timeout is MAX(NODE_TIMEOUT*2,2000) milliseconds. + * Retry is two times the Timeout. + */ + auth_timeout = server.cluster_node_timeout*2; + if (auth_timeout < 2000) auth_timeout = 2000; + auth_retry_time = auth_timeout*2; + + /* Pre conditions to run the function, that must be met both in case + * of an automatic or manual failover: + * 1) We are a slave. + * 2) Our master is flagged as FAIL, or this is a manual failover. + * 3) We don't have the no failover configuration set, and this is + * not a manual failover. + * 4) It is serving slots. */ + if (clusterNodeIsMaster(myself) || + myself->slaveof == NULL || + (!nodeFailed(myself->slaveof) && !manual_failover) || + (server.cluster_slave_no_failover && !manual_failover) || + myself->slaveof->numslots == 0) + { + /* There are no reasons to failover, so we set the reason why we + * are returning without failing over to NONE. */ + server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE; + return; + } + + /* Set data_age to the number of milliseconds we are disconnected from + * the master. 
*/ + if (server.repl_state == REPL_STATE_CONNECTED) { + data_age = (mstime_t)(server.unixtime - server.master->lastinteraction) + * 1000; + } else { + data_age = (mstime_t)(server.unixtime - server.repl_down_since) * 1000; + } + + /* Remove the node timeout from the data age as it is fine that we are + * disconnected from our master at least for the time it was down to be + * flagged as FAIL, that's the baseline. */ + if (data_age > server.cluster_node_timeout) + data_age -= server.cluster_node_timeout; + + /* Check if our data is recent enough according to the slave validity + * factor configured by the user. + * + * Check bypassed for manual failovers. */ + if (server.cluster_slave_validity_factor && + data_age > + (((mstime_t)server.repl_ping_slave_period * 1000) + + (server.cluster_node_timeout * server.cluster_slave_validity_factor))) + { + if (!manual_failover) { + clusterLogCantFailover(CLUSTER_CANT_FAILOVER_DATA_AGE); + return; + } + } + + /* If the previous failover attempt timeout and the retry time has + * elapsed, we can setup a new one. */ + if (auth_age > auth_retry_time) { + server.cluster->failover_auth_time = mstime() + + 500 + /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */ + random() % 500; /* Random delay between 0 and 500 milliseconds. */ + server.cluster->failover_auth_count = 0; + server.cluster->failover_auth_sent = 0; + server.cluster->failover_auth_rank = clusterGetSlaveRank(); + /* We add another delay that is proportional to the slave rank. + * Specifically 1 second * rank. This way slaves that have a probably + * less updated replication offset, are penalized. */ + server.cluster->failover_auth_time += + server.cluster->failover_auth_rank * 1000; + /* However if this is a manual failover, no delay is needed. 
*/ + if (server.cluster->mf_end) { + server.cluster->failover_auth_time = mstime(); + server.cluster->failover_auth_rank = 0; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); + } + serverLog(LL_NOTICE, + "Start of election delayed for %lld milliseconds " + "(rank #%d, offset %lld).", + server.cluster->failover_auth_time - mstime(), + server.cluster->failover_auth_rank, + replicationGetSlaveOffset()); + /* Now that we have a scheduled election, broadcast our offset + * to all the other slaves so that they'll updated their offsets + * if our offset is better. */ + clusterBroadcastPong(CLUSTER_BROADCAST_LOCAL_SLAVES); + return; + } + + /* It is possible that we received more updated offsets from other + * slaves for the same master since we computed our election delay. + * Update the delay if our rank changed. + * + * Not performed if this is a manual failover. */ + if (server.cluster->failover_auth_sent == 0 && + server.cluster->mf_end == 0) + { + int newrank = clusterGetSlaveRank(); + if (newrank > server.cluster->failover_auth_rank) { + long long added_delay = + (newrank - server.cluster->failover_auth_rank) * 1000; + server.cluster->failover_auth_time += added_delay; + server.cluster->failover_auth_rank = newrank; + serverLog(LL_NOTICE, + "Replica rank updated to #%d, added %lld milliseconds of delay.", + newrank, added_delay); + } + } + + /* Return ASAP if we can't still start the election. */ + if (mstime() < server.cluster->failover_auth_time) { + clusterLogCantFailover(CLUSTER_CANT_FAILOVER_WAITING_DELAY); + return; + } + + /* Return ASAP if the election is too old to be valid. */ + if (auth_age > auth_timeout) { + clusterLogCantFailover(CLUSTER_CANT_FAILOVER_EXPIRED); + return; + } + + /* Ask for votes if needed. 
*/ + if (server.cluster->failover_auth_sent == 0) { + server.cluster->currentEpoch++; + server.cluster->failover_auth_epoch = server.cluster->currentEpoch; + serverLog(LL_NOTICE,"Starting a failover election for epoch %llu.", + (unsigned long long) server.cluster->currentEpoch); + clusterRequestFailoverAuth(); + server.cluster->failover_auth_sent = 1; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_FSYNC_CONFIG); + return; /* Wait for replies. */ + } + + /* Check if we reached the quorum. */ + if (server.cluster->failover_auth_count >= needed_quorum) { + /* We have the quorum, we can finally failover the master. */ + + serverLog(LL_NOTICE, + "Failover election won: I'm the new master."); + + /* Update my configEpoch to the epoch of the election. */ + if (myself->configEpoch < server.cluster->failover_auth_epoch) { + myself->configEpoch = server.cluster->failover_auth_epoch; + serverLog(LL_NOTICE, + "configEpoch set to %llu after successful failover", + (unsigned long long) myself->configEpoch); + } + + /* Take responsibility for the cluster slots. */ + clusterFailoverReplaceYourMaster(); + } else { + clusterLogCantFailover(CLUSTER_CANT_FAILOVER_WAITING_VOTES); + } +} + +/* ----------------------------------------------------------------------------- + * CLUSTER slave migration + * + * Slave migration is the process that allows a slave of a master that is + * already covered by at least another slave, to "migrate" to a master that + * is orphaned, that is, left with no working slaves. + * ------------------------------------------------------------------------- */ + +/* This function is responsible to decide if this replica should be migrated + * to a different (orphaned) master. It is called by the clusterCron() function + * only if: + * + * 1) We are a slave node. + * 2) It was detected that there is at least one orphaned master in + * the cluster. 
+ * 3) We are a slave of one of the masters with the greatest number of + * slaves. + * + * This checks are performed by the caller since it requires to iterate + * the nodes anyway, so we spend time into clusterHandleSlaveMigration() + * if definitely needed. + * + * The function is called with a pre-computed max_slaves, that is the max + * number of working (not in FAIL state) slaves for a single master. + * + * Additional conditions for migration are examined inside the function. + */ +void clusterHandleSlaveMigration(int max_slaves) { + int j, okslaves = 0; + clusterNode *mymaster = myself->slaveof, *target = NULL, *candidate = NULL; + dictIterator *di; + dictEntry *de; + + /* Step 1: Don't migrate if the cluster state is not ok. */ + if (server.cluster->state != CLUSTER_OK) return; + + /* Step 2: Don't migrate if my master will not be left with at least + * 'migration-barrier' slaves after my migration. */ + if (mymaster == NULL) return; + for (j = 0; j < mymaster->numslaves; j++) + if (!nodeFailed(mymaster->slaves[j]) && + !nodeTimedOut(mymaster->slaves[j])) okslaves++; + if (okslaves <= server.cluster_migration_barrier) return; + + /* Step 3: Identify a candidate for migration, and check if among the + * masters with the greatest number of ok slaves, I'm the one with the + * smallest node ID (the "candidate slave"). + * + * Note: this means that eventually a replica migration will occur + * since slaves that are reachable again always have their FAIL flag + * cleared, so eventually there must be a candidate. + * There is a possible race condition causing multiple + * slaves to migrate at the same time, but this is unlikely to + * happen and relatively harmless when it does. 
*/ + candidate = myself; + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + int okslaves = 0, is_orphaned = 1; + + /* We want to migrate only if this master is working, orphaned, and + * used to have slaves or if failed over a master that had slaves + * (MIGRATE_TO flag). This way we only migrate to instances that were + * supposed to have replicas. */ + if (nodeIsSlave(node) || nodeFailed(node)) is_orphaned = 0; + if (!(node->flags & CLUSTER_NODE_MIGRATE_TO)) is_orphaned = 0; + + /* Check number of working slaves. */ + if (clusterNodeIsMaster(node)) okslaves = clusterCountNonFailingSlaves(node); + if (okslaves > 0) is_orphaned = 0; + + if (is_orphaned) { + if (!target && node->numslots > 0) target = node; + + /* Track the starting time of the orphaned condition for this + * master. */ + if (!node->orphaned_time) node->orphaned_time = mstime(); + } else { + node->orphaned_time = 0; + } + + /* Check if I'm the slave candidate for the migration: attached + * to a master with the maximum number of slaves and with the smallest + * node ID. */ + if (okslaves == max_slaves) { + for (j = 0; j < node->numslaves; j++) { + if (memcmp(node->slaves[j]->name, + candidate->name, + CLUSTER_NAMELEN) < 0) + { + candidate = node->slaves[j]; + } + } + } + } + dictReleaseIterator(di); + + /* Step 4: perform the migration if there is a target, and if I'm the + * candidate, but only if the master is continuously orphaned for a + * couple of seconds, so that during failovers, we give some time to + * the natural slaves of this instance to advertise their switch from + * the old master to the new one. 
*/ + if (target && candidate == myself && + (mstime()-target->orphaned_time) > CLUSTER_SLAVE_MIGRATION_DELAY && + !(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER)) + { + serverLog(LL_NOTICE,"Migrating to orphaned master %.40s", + target->name); + clusterSetMaster(target); + } +} + +/* ----------------------------------------------------------------------------- + * CLUSTER manual failover + * + * This are the important steps performed by slaves during a manual failover: + * 1) User send CLUSTER FAILOVER command. The failover state is initialized + * setting mf_end to the millisecond unix time at which we'll abort the + * attempt. + * 2) Slave sends a MFSTART message to the master requesting to pause clients + * for two times the manual failover timeout CLUSTER_MF_TIMEOUT. + * When master is paused for manual failover, it also starts to flag + * packets with CLUSTERMSG_FLAG0_PAUSED. + * 3) Slave waits for master to send its replication offset flagged as PAUSED. + * 4) If slave received the offset from the master, and its offset matches, + * mf_can_start is set to 1, and clusterHandleSlaveFailover() will perform + * the failover as usually, with the difference that the vote request + * will be modified to force masters to vote for a slave that has a + * working master. + * + * From the point of view of the master things are simpler: when a + * PAUSE_CLIENTS packet is received the master sets mf_end as well and + * the sender in mf_slave. During the time limit for the manual failover + * the master will just send PINGs more often to this slave, flagged with + * the PAUSED flag, so that the slave will set mf_master_offset when receiving + * a packet from the master with this flag set. + * + * The goal of the manual failover is to perform a fast failover without + * data loss due to the asynchronous master-slave replication. + * -------------------------------------------------------------------------- */ + +/* Reset the manual failover state. 
This works for both masters and slaves + * as all the state about manual failover is cleared. + * + * The function can be used both to initialize the manual failover state at + * startup or to abort a manual failover in progress. */ +void resetManualFailover(void) { + if (server.cluster->mf_slave) { + /* We were a master failing over, so we paused clients and related actions. + * Regardless of the outcome we unpause now to allow traffic again. */ + unpauseActions(PAUSE_DURING_FAILOVER); + } + server.cluster->mf_end = 0; /* No manual failover in progress. */ + server.cluster->mf_can_start = 0; + server.cluster->mf_slave = NULL; + server.cluster->mf_master_offset = -1; +} + +/* If a manual failover timed out, abort it. */ +void manualFailoverCheckTimeout(void) { + if (server.cluster->mf_end && server.cluster->mf_end < mstime()) { + serverLog(LL_WARNING,"Manual failover timed out."); + resetManualFailover(); + } +} + +/* This function is called from the cluster cron function in order to go + * forward with a manual failover state machine. */ +void clusterHandleManualFailover(void) { + /* Return ASAP if no manual failover is in progress. */ + if (server.cluster->mf_end == 0) return; + + /* If mf_can_start is non-zero, the failover was already triggered so the + * next steps are performed by clusterHandleSlaveFailover(). */ + if (server.cluster->mf_can_start) return; + + if (server.cluster->mf_master_offset == -1) return; /* Wait for offset... */ + + if (server.cluster->mf_master_offset == replicationGetSlaveOffset()) { + /* Our replication offset matches the master replication offset + * announced after clients were paused. We can start the failover. 
*/ + server.cluster->mf_can_start = 1; + serverLog(LL_NOTICE, + "All master replication stream processed, " + "manual failover can start."); + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); + return; + } + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER); +} + +/* ----------------------------------------------------------------------------- + * CLUSTER cron job + * -------------------------------------------------------------------------- */ + +/* Check if the node is disconnected and re-establish the connection. + * Also update a few stats while we are here, that can be used to make + * better decisions in other part of the code. */ +static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t handshake_timeout, mstime_t now) { + /* Not interested in reconnecting the link with myself or nodes + * for which we have no address. */ + if (node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_NOADDR)) return 1; + + if (node->flags & CLUSTER_NODE_PFAIL) + server.cluster->stats_pfail_nodes++; + + /* A Node in HANDSHAKE state has a limited lifespan equal to the + * configured node timeout. */ + if (nodeInHandshake(node) && now - node->ctime > handshake_timeout) { + clusterDelNode(node); + return 1; + } + + if (node->link == NULL) { + clusterLink *link = createClusterLink(node); + link->conn = connCreate(connTypeOfCluster()); + connSetPrivateData(link->conn, link); + if (connConnect(link->conn, node->ip, node->cport, server.bind_source_addr, + clusterLinkConnectHandler) == C_ERR) { + /* We got a synchronous error from connect before + * clusterSendPing() had a chance to be called. + * If node->ping_sent is zero, failure detection can't work, + * so we claim we actually sent a ping now (that will + * be really sent as soon as the link is obtained). 
*/ + if (node->ping_sent == 0) node->ping_sent = mstime(); + serverLog(LL_DEBUG, "Unable to connect to " + "Cluster Node [%s]:%d -> %s", node->ip, + node->cport, server.neterr); + + freeClusterLink(link); + return 0; + } + } + return 0; +} + +static void freeClusterLinkOnBufferLimitReached(clusterLink *link) { + if (link == NULL || server.cluster_link_msg_queue_limit_bytes == 0) { + return; + } + + unsigned long long mem_link = link->send_msg_queue_mem; + if (mem_link > server.cluster_link_msg_queue_limit_bytes) { + serverLog(LL_WARNING, "Freeing cluster link(%s node %.40s, used memory: %llu) due to " + "exceeding send buffer memory limit.", link->inbound ? "from" : "to", + link->node ? link->node->name : "", mem_link); + freeClusterLink(link); + server.cluster->stat_cluster_links_buffer_limit_exceeded++; + } +} + +/* Free outbound link to a node if its send buffer size exceeded limit. */ +static void clusterNodeCronFreeLinkOnBufferLimitReached(clusterNode *node) { + freeClusterLinkOnBufferLimitReached(node->link); + freeClusterLinkOnBufferLimitReached(node->inbound_link); +} + +/* This is executed 10 times every second */ +void clusterCron(void) { + dictIterator *di; + dictEntry *de; + int update_state = 0; + int orphaned_masters; /* How many masters there are without ok slaves. */ + int max_slaves; /* Max number of ok slaves for a single master. */ + int this_slaves; /* Number of ok slaves for our master (if we are slave). */ + mstime_t min_pong = 0, now = mstime(); + clusterNode *min_pong_node = NULL; + static unsigned long long iteration = 0; + mstime_t handshake_timeout; + + iteration++; /* Number of times this function was called so far. */ + + clusterUpdateMyselfHostname(); + + /* The handshake timeout is the time after which a handshake node that was + * not turned into a normal node is removed from the nodes. Usually it is + * just the NODE_TIMEOUT value, but when NODE_TIMEOUT is too small we use + * the value of 1 second. 
*/ + handshake_timeout = server.cluster_node_timeout; + if (handshake_timeout < 1000) handshake_timeout = 1000; + + /* Clear so clusterNodeCronHandleReconnect can count the number of nodes in PFAIL. */ + server.cluster->stats_pfail_nodes = 0; + /* Run through some of the operations we want to do on each cluster node. */ + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + /* We free the inbound or outboud link to the node if the link has an + * oversized message send queue and immediately try reconnecting. */ + clusterNodeCronFreeLinkOnBufferLimitReached(node); + /* The protocol is that function(s) below return non-zero if the node was + * terminated. + */ + if(clusterNodeCronHandleReconnect(node, handshake_timeout, now)) continue; + } + dictReleaseIterator(di); + + /* Ping some random node 1 time every 10 iterations, so that we usually ping + * one random node every second. */ + if (!(iteration % 10)) { + int j; + + /* Check a few random nodes and ping the one with the oldest + * pong_received time. */ + for (j = 0; j < 5; j++) { + de = dictGetRandomKey(server.cluster->nodes); + clusterNode *this = dictGetVal(de); + + /* Don't ping nodes disconnected or with a ping currently active. */ + if (this->link == NULL || this->ping_sent != 0) continue; + if (this->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE)) + continue; + if (min_pong_node == NULL || min_pong > this->pong_received) { + min_pong_node = this; + min_pong = this->pong_received; + } + } + if (min_pong_node) { + serverLog(LL_DEBUG,"Pinging node %.40s", min_pong_node->name); + clusterSendPing(min_pong_node->link, CLUSTERMSG_TYPE_PING); + } + } + + /* Iterate nodes to check if we need to flag something as failing. + * This loop is also responsible to: + * 1) Check if there are orphaned masters (masters without non failing + * slaves). + * 2) Count the max number of non failing slaves for a single master. 
+ * 3) Count the number of slaves for our master, if we are a slave. */ + orphaned_masters = 0; + max_slaves = 0; + this_slaves = 0; + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + now = mstime(); /* Use an updated time at every iteration. */ + + if (node->flags & + (CLUSTER_NODE_MYSELF|CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE)) + continue; + + /* Orphaned master check, useful only if the current instance + * is a slave that may migrate to another master. */ + if (nodeIsSlave(myself) && clusterNodeIsMaster(node) && !nodeFailed(node)) { + int okslaves = clusterCountNonFailingSlaves(node); + + /* A master is orphaned if it is serving a non-zero number of + * slots, have no working slaves, but used to have at least one + * slave, or failed over a master that used to have slaves. */ + if (okslaves == 0 && node->numslots > 0 && + node->flags & CLUSTER_NODE_MIGRATE_TO) + { + orphaned_masters++; + } + if (okslaves > max_slaves) max_slaves = okslaves; + if (myself->slaveof == node) + this_slaves = okslaves; + } + + /* If we are not receiving any data for more than half the cluster + * timeout, reconnect the link: maybe there is a connection + * issue even if the node is alive. */ + mstime_t ping_delay = now - node->ping_sent; + mstime_t data_delay = now - node->data_received; + if (node->link && /* is connected */ + now - node->link->ctime > + server.cluster_node_timeout && /* was not already reconnected */ + node->ping_sent && /* we already sent a ping */ + /* and we are waiting for the pong more than timeout/2 */ + ping_delay > server.cluster_node_timeout/2 && + /* and in such interval we are not seeing any traffic at all. */ + data_delay > server.cluster_node_timeout/2) + { + /* Disconnect the link, it will be reconnected automatically. 
*/ + freeClusterLink(node->link); + } + + /* If we have currently no active ping in this instance, and the + * received PONG is older than half the cluster timeout, send + * a new ping now, to ensure all the nodes are pinged without + * a too big delay. */ + mstime_t ping_interval = server.cluster_ping_interval ? + server.cluster_ping_interval : server.cluster_node_timeout/2; + if (node->link && + node->ping_sent == 0 && + (now - node->pong_received) > ping_interval) + { + clusterSendPing(node->link, CLUSTERMSG_TYPE_PING); + continue; + } + + /* If we are a master and one of the slaves requested a manual + * failover, ping it continuously. */ + if (server.cluster->mf_end && + clusterNodeIsMaster(myself) && + server.cluster->mf_slave == node && + node->link) + { + clusterSendPing(node->link, CLUSTERMSG_TYPE_PING); + continue; + } + + /* Check only if we have an active ping for this instance. */ + if (node->ping_sent == 0) continue; + + /* Check if this node looks unreachable. + * Note that if we already received the PONG, then node->ping_sent + * is zero, so can't reach this code at all, so we don't risk of + * checking for a PONG delay if we didn't sent the PING. + * + * We also consider every incoming data as proof of liveness, since + * our cluster bus link is also used for data: under heavy data + * load pong delays are possible. */ + mstime_t node_delay = (ping_delay < data_delay) ? ping_delay : + data_delay; + + if (node_delay > server.cluster_node_timeout) { + /* Timeout reached. Set the node as possibly failing if it is + * not already in this state. */ + if (!(node->flags & (CLUSTER_NODE_PFAIL|CLUSTER_NODE_FAIL))) { + serverLog(LL_DEBUG,"*** NODE %.40s possibly failing", + node->name); + node->flags |= CLUSTER_NODE_PFAIL; + update_state = 1; + } + } + } + dictReleaseIterator(di); + + /* If we are a slave node but the replication is still turned off, + * enable it if we know the address of our master and it appears to + * be up. 
*/ + if (nodeIsSlave(myself) && + server.masterhost == NULL && + myself->slaveof && + nodeHasAddr(myself->slaveof)) + { + replicationSetMaster(myself->slaveof->ip, getNodeDefaultReplicationPort(myself->slaveof)); + } + + /* Abort a manual failover if the timeout is reached. */ + manualFailoverCheckTimeout(); + + if (nodeIsSlave(myself)) { + clusterHandleManualFailover(); + if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER)) + clusterHandleSlaveFailover(); + /* If there are orphaned slaves, and we are a slave among the masters + * with the max number of non-failing slaves, consider migrating to + * the orphaned masters. Note that it does not make sense to try + * a migration if there is no master with at least *two* working + * slaves. */ + if (orphaned_masters && max_slaves >= 2 && this_slaves == max_slaves && + server.cluster_allow_replica_migration) + clusterHandleSlaveMigration(max_slaves); + } + + if (update_state || server.cluster->state == CLUSTER_FAIL) + clusterUpdateState(); +} + +/* This function is called before the event handler returns to sleep for + * events. It is useful to perform operations that must be done ASAP in + * reaction to events fired but that are not safe to perform inside event + * handlers, or to perform potentially expansive tasks that we need to do + * a single time before replying to clients. */ +void clusterBeforeSleep(void) { + int flags = server.cluster->todo_before_sleep; + + /* Reset our flags (not strictly needed since every single function + * called for flags set should be able to clear its flag). 
*/ + server.cluster->todo_before_sleep = 0; + + if (flags & CLUSTER_TODO_HANDLE_MANUALFAILOVER) { + /* Handle manual failover as soon as possible so that won't have a 100ms + * as it was handled only in clusterCron */ + if(nodeIsSlave(myself)) { + clusterHandleManualFailover(); + if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER)) + clusterHandleSlaveFailover(); + } + } else if (flags & CLUSTER_TODO_HANDLE_FAILOVER) { + /* Handle failover, this is needed when it is likely that there is already + * the quorum from masters in order to react fast. */ + clusterHandleSlaveFailover(); + } + + /* Update the cluster state. */ + if (flags & CLUSTER_TODO_UPDATE_STATE) + clusterUpdateState(); + + /* Save the config, possibly using fsync. */ + if (flags & CLUSTER_TODO_SAVE_CONFIG) { + int fsync = flags & CLUSTER_TODO_FSYNC_CONFIG; + clusterSaveConfigOrDie(fsync); + } +} + +void clusterDoBeforeSleep(int flags) { + server.cluster->todo_before_sleep |= flags; +} + +/* ----------------------------------------------------------------------------- + * Slots management + * -------------------------------------------------------------------------- */ + +/* Test bit 'pos' in a generic bitmap. Return 1 if the bit is set, + * otherwise 0. */ +int bitmapTestBit(unsigned char *bitmap, int pos) { + off_t byte = pos/8; + int bit = pos&7; + return (bitmap[byte] & (1<<bit)) != 0; +} + +/* Set the bit at position 'pos' in a bitmap. */ +void bitmapSetBit(unsigned char *bitmap, int pos) { + off_t byte = pos/8; + int bit = pos&7; + bitmap[byte] |= 1<<bit; +} + +/* Clear the bit at position 'pos' in a bitmap. */ +void bitmapClearBit(unsigned char *bitmap, int pos) { + off_t byte = pos/8; + int bit = pos&7; + bitmap[byte] &= ~(1<<bit); +} + +/* Return non-zero if there is at least one master with slaves in the cluster. + * Otherwise zero is returned. Used by clusterNodeSetSlotBit() to set the + * MIGRATE_TO flag when a master gets its first slot. */ +int clusterMastersHaveSlaves(void) { + dictIterator *di = dictGetSafeIterator(server.cluster->nodes); + dictEntry *de; + int slaves = 0; + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + if (nodeIsSlave(node)) continue; + slaves += node->numslaves; + } + dictReleaseIterator(di); + return slaves != 0; +} + +/* Set the slot bit and return the old value. 
*/ +int clusterNodeSetSlotBit(clusterNode *n, int slot) { + int old = bitmapTestBit(n->slots,slot); + if (!old) { + bitmapSetBit(n->slots,slot); + n->numslots++; + /* When a master gets its first slot, even if it has no slaves, + * it gets flagged with MIGRATE_TO, that is, the master is a valid + * target for replicas migration, if and only if at least one of + * the other masters has slaves right now. + * + * Normally masters are valid targets of replica migration if: + * 1. The used to have slaves (but no longer have). + * 2. They are slaves failing over a master that used to have slaves. + * + * However new masters with slots assigned are considered valid + * migration targets if the rest of the cluster is not a slave-less. + * + * See https://github.com/redis/redis/issues/3043 for more info. */ + if (n->numslots == 1 && clusterMastersHaveSlaves()) + n->flags |= CLUSTER_NODE_MIGRATE_TO; + } + return old; +} + +/* Clear the slot bit and return the old value. */ +int clusterNodeClearSlotBit(clusterNode *n, int slot) { + int old = bitmapTestBit(n->slots,slot); + if (old) { + bitmapClearBit(n->slots,slot); + n->numslots--; + } + return old; +} + +/* Return the slot bit from the cluster node structure. */ +int clusterNodeCoversSlot(clusterNode *n, int slot) { + return bitmapTestBit(n->slots,slot); +} + +/* Add the specified slot to the list of slots that node 'n' will + * serve. Return C_OK if the operation ended with success. + * If the slot is already assigned to another instance this is considered + * an error and C_ERR is returned. */ +int clusterAddSlot(clusterNode *n, int slot) { + if (server.cluster->slots[slot]) return C_ERR; + clusterNodeSetSlotBit(n,slot); + server.cluster->slots[slot] = n; + return C_OK; +} + +/* Delete the specified slot marking it as unassigned. + * Returns C_OK if the slot was assigned, otherwise if the slot was + * already unassigned C_ERR is returned. 
*/ +int clusterDelSlot(int slot) { + clusterNode *n = server.cluster->slots[slot]; + + if (!n) return C_ERR; + + /* Cleanup the channels in master/replica as part of slot deletion. */ + removeChannelsInSlot(slot); + /* Clear the slot bit. */ + serverAssert(clusterNodeClearSlotBit(n,slot) == 1); + server.cluster->slots[slot] = NULL; + /* Make owner_not_claiming_slot flag consistent with slot ownership information. */ + bitmapClearBit(server.cluster->owner_not_claiming_slot, slot); + return C_OK; +} + +/* Delete all the slots associated with the specified node. + * The number of deleted slots is returned. */ +int clusterDelNodeSlots(clusterNode *node) { + int deleted = 0, j; + + for (j = 0; j < CLUSTER_SLOTS; j++) { + if (clusterNodeCoversSlot(node, j)) { + clusterDelSlot(j); + deleted++; + } + } + return deleted; +} + +/* Clear the migrating / importing state for all the slots. + * This is useful at initialization and when turning a master into slave. */ +void clusterCloseAllSlots(void) { + memset(server.cluster->migrating_slots_to,0, + sizeof(server.cluster->migrating_slots_to)); + memset(server.cluster->importing_slots_from,0, + sizeof(server.cluster->importing_slots_from)); +} + +/* ----------------------------------------------------------------------------- + * Cluster state evaluation function + * -------------------------------------------------------------------------- */ + +/* The following are defines that are only used in the evaluation function + * and are based on heuristics. Actually the main point about the rejoin and + * writable delay is that they should be a few orders of magnitude larger + * than the network latency. 
*/ +#define CLUSTER_MAX_REJOIN_DELAY 5000 +#define CLUSTER_MIN_REJOIN_DELAY 500 +#define CLUSTER_WRITABLE_DELAY 2000 + +void clusterUpdateState(void) { + int j, new_state; + int reachable_masters = 0; + static mstime_t among_minority_time; + static mstime_t first_call_time = 0; + + server.cluster->todo_before_sleep &= ~CLUSTER_TODO_UPDATE_STATE; + + /* If this is a master node, wait some time before turning the state + * into OK, since it is not a good idea to rejoin the cluster as a writable + * master, after a reboot, without giving the cluster a chance to + * reconfigure this node. Note that the delay is calculated starting from + * the first call to this function and not since the server start, in order + * to not count the DB loading time. */ + if (first_call_time == 0) first_call_time = mstime(); + if (clusterNodeIsMaster(myself) && + server.cluster->state == CLUSTER_FAIL && + mstime() - first_call_time < CLUSTER_WRITABLE_DELAY) return; + + /* Start assuming the state is OK. We'll turn it into FAIL if there + * are the right conditions. */ + new_state = CLUSTER_OK; + + /* Check if all the slots are covered. */ + if (server.cluster_require_full_coverage) { + for (j = 0; j < CLUSTER_SLOTS; j++) { + if (server.cluster->slots[j] == NULL || + server.cluster->slots[j]->flags & (CLUSTER_NODE_FAIL)) + { + new_state = CLUSTER_FAIL; + break; + } + } + } + + /* Compute the cluster size, that is the number of master nodes + * serving at least a single slot. + * + * At the same time count the number of reachable masters having + * at least one slot. 
*/ + { + dictIterator *di; + dictEntry *de; + + server.cluster->size = 0; + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + if (clusterNodeIsMaster(node) && node->numslots) { + server.cluster->size++; + if ((node->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) == 0) + reachable_masters++; + } + } + dictReleaseIterator(di); + } + + /* If we are in a minority partition, change the cluster state + * to FAIL. */ + { + int needed_quorum = (server.cluster->size / 2) + 1; + + if (reachable_masters < needed_quorum) { + new_state = CLUSTER_FAIL; + among_minority_time = mstime(); + } + } + + /* Log a state change */ + if (new_state != server.cluster->state) { + mstime_t rejoin_delay = server.cluster_node_timeout; + + /* If the instance is a master and was partitioned away with the + * minority, don't let it accept queries for some time after the + * partition heals, to make sure there is enough time to receive + * a configuration update. */ + if (rejoin_delay > CLUSTER_MAX_REJOIN_DELAY) + rejoin_delay = CLUSTER_MAX_REJOIN_DELAY; + if (rejoin_delay < CLUSTER_MIN_REJOIN_DELAY) + rejoin_delay = CLUSTER_MIN_REJOIN_DELAY; + + if (new_state == CLUSTER_OK && + clusterNodeIsMaster(myself) && + mstime() - among_minority_time < rejoin_delay) + { + return; + } + + /* Change the state and log the event. */ + serverLog(new_state == CLUSTER_OK ? LL_NOTICE : LL_WARNING, + "Cluster state changed: %s", + new_state == CLUSTER_OK ? "ok" : "fail"); + server.cluster->state = new_state; + } +} + +/* This function is called after the node startup in order to verify that data + * loaded from disk is in agreement with the cluster configuration: + * + * 1) If we find keys about hash slots we have no responsibility for, the + * following happens: + * A) If no other node is in charge according to the current cluster + * configuration, we add these slots to our node. 
 *    B) If according to our config other nodes are already in charge for
 *       this slots, we set the slots as IMPORTING from our point of view
 *       in order to justify we have those slots, and in order to make
 *       redis-cli aware of the issue, so that it can try to fix it.
 * 2) If we find data in a DB different than DB0 we return C_ERR to
 *    signal the caller it should quit the server with an error message
 *    or take other actions.
 *
 * The function always returns C_OK even if it will try to correct
 * the error described in "1". However if data is found in DB different
 * from DB0, C_ERR is returned.
 *
 * The function also uses the logging facility in order to warn the user
 * about desynchronizations between the data we have in memory and the
 * cluster configuration. */
int verifyClusterConfigWithData(void) {
    int j;
    int update_config = 0;  /* Non-zero when the cluster table was fixed and must be saved. */

    /* Return ASAP if a module disabled cluster redirections. In that case
     * every master can store keys about every possible hash slot. */
    if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION)
        return C_OK;

    /* If this node is a slave, don't perform the check at all as we
     * completely depend on the replication stream. */
    if (nodeIsSlave(myself)) return C_OK;

    /* Make sure we only have keys in DB0. */
    for (j = 1; j < server.dbnum; j++) {
        if (dbSize(&server.db[j], DB_MAIN)) return C_ERR;
    }

    /* Check that all the slots we see populated memory have a corresponding
     * entry in the cluster table. Otherwise fix the table. */
    for (j = 0; j < CLUSTER_SLOTS; j++) {
        if (!countKeysInSlot(j)) continue; /* No keys in this slot. */
        /* Check if we are assigned to this slot or if we are importing it.
         * In both cases check the next slot as the configuration makes
         * sense. */
        if (server.cluster->slots[j] == myself ||
            server.cluster->importing_slots_from[j] != NULL) continue;

        /* If we are here data and cluster config don't agree, and we have
         * slot 'j' populated even if we are not importing it, nor we are
         * assigned to this slot. Fix this condition. */

        update_config++;
        /* Case A: slot is unassigned. Take responsibility for it. */
        if (server.cluster->slots[j] == NULL) {
            serverLog(LL_NOTICE, "I have keys for unassigned slot %d. "
                                 "Taking responsibility for it.",j);
            clusterAddSlot(myself,j);
        } else {
            serverLog(LL_NOTICE, "I have keys for slot %d, but the slot is "
                                 "assigned to another node. "
                                 "Setting it to importing state.",j);
            server.cluster->importing_slots_from[j] = server.cluster->slots[j];
        }
    }
    if (update_config) clusterSaveConfigOrDie(1);
    return C_OK;
}

/* Remove all the shard channel related information not owned by the current shard. */
static inline void removeAllNotOwnedShardChannelSubscriptions(void) {
    if (!dictSize(server.pubsubshard_channels)) return;
    clusterNode *currmaster = clusterNodeIsMaster(myself) ? myself : myself->slaveof;
    for (int j = 0; j < CLUSTER_SLOTS; j++) {
        if (server.cluster->slots[j] != currmaster) {
            removeChannelsInSlot(j);
        }
    }
}

/* -----------------------------------------------------------------------------
 * SLAVE nodes handling
 * -------------------------------------------------------------------------- */

/* Set the specified node 'n' as master for this node.
 * If this node is currently a master, it is turned into a slave.
*/ +void clusterSetMaster(clusterNode *n) { + serverAssert(n != myself); + serverAssert(myself->numslots == 0); + + if (clusterNodeIsMaster(myself)) { + myself->flags &= ~(CLUSTER_NODE_MASTER|CLUSTER_NODE_MIGRATE_TO); + myself->flags |= CLUSTER_NODE_SLAVE; + clusterCloseAllSlots(); + } else { + if (myself->slaveof) + clusterNodeRemoveSlave(myself->slaveof,myself); + } + myself->slaveof = n; + updateShardId(myself, n->shard_id); + clusterNodeAddSlave(n,myself); + replicationSetMaster(n->ip, getNodeDefaultReplicationPort(n)); + removeAllNotOwnedShardChannelSubscriptions(); + resetManualFailover(); +} + +/* ----------------------------------------------------------------------------- + * Nodes to string representation functions. + * -------------------------------------------------------------------------- */ + +struct redisNodeFlags { + uint16_t flag; + char *name; +}; + +static struct redisNodeFlags redisNodeFlagsTable[] = { + {CLUSTER_NODE_MYSELF, "myself,"}, + {CLUSTER_NODE_MASTER, "master,"}, + {CLUSTER_NODE_SLAVE, "slave,"}, + {CLUSTER_NODE_PFAIL, "fail?,"}, + {CLUSTER_NODE_FAIL, "fail,"}, + {CLUSTER_NODE_HANDSHAKE, "handshake,"}, + {CLUSTER_NODE_NOADDR, "noaddr,"}, + {CLUSTER_NODE_NOFAILOVER, "nofailover,"} +}; + +/* Concatenate the comma separated list of node flags to the given SDS + * string 'ci'. */ +sds representClusterNodeFlags(sds ci, uint16_t flags) { + size_t orig_len = sdslen(ci); + int i, size = sizeof(redisNodeFlagsTable)/sizeof(struct redisNodeFlags); + for (i = 0; i < size; i++) { + struct redisNodeFlags *nodeflag = redisNodeFlagsTable + i; + if (flags & nodeflag->flag) ci = sdscat(ci, nodeflag->name); + } + /* If no flag was added, add the "noflags" special flag. */ + if (sdslen(ci) == orig_len) ci = sdscat(ci,"noflags,"); + sdsIncrLen(ci,-1); /* Remove trailing comma. */ + return ci; +} + +/* Concatenate the slot ownership information to the given SDS string 'ci'. 
+ * If the slot ownership is in a contiguous block, it's represented as start-end pair, + * else each slot is added separately. */ +sds representSlotInfo(sds ci, uint16_t *slot_info_pairs, int slot_info_pairs_count) { + for (int i = 0; i< slot_info_pairs_count; i+=2) { + unsigned long start = slot_info_pairs[i]; + unsigned long end = slot_info_pairs[i+1]; + if (start == end) { + ci = sdscatfmt(ci, " %i", start); + } else { + ci = sdscatfmt(ci, " %i-%i", start, end); + } + } + return ci; +} + +/* Generate a csv-alike representation of the specified cluster node. + * See clusterGenNodesDescription() top comment for more information. + * + * The function returns the string representation as an SDS string. */ +sds clusterGenNodeDescription(client *c, clusterNode *node, int tls_primary) { + int j, start; + sds ci; + int port = clusterNodeClientPort(node, tls_primary); + + /* Node coordinates */ + ci = sdscatlen(sdsempty(),node->name,CLUSTER_NAMELEN); + ci = sdscatfmt(ci," %s:%i@%i", + node->ip, + port, + node->cport); + if (sdslen(node->hostname) != 0) { + ci = sdscatfmt(ci,",%s", node->hostname); + } + /* Don't expose aux fields to any clients yet but do allow them + * to be persisted to nodes.conf */ + if (c == NULL) { + if (sdslen(node->hostname) == 0) { + ci = sdscatfmt(ci,",", 1); + } + for (int i = af_count-1; i >=0; i--) { + if ((tls_primary && i == af_tls_port) || (!tls_primary && i == af_tcp_port)) { + continue; + } + if (auxFieldHandlers[i].isPresent(node)) { + ci = sdscatprintf(ci, ",%s=", auxFieldHandlers[i].field); + ci = auxFieldHandlers[i].getter(node, ci); + } + } + } + + /* Flags */ + ci = sdscatlen(ci," ",1); + ci = representClusterNodeFlags(ci, node->flags); + + /* Slave of... 
or just "-" */ + ci = sdscatlen(ci," ",1); + if (node->slaveof) + ci = sdscatlen(ci,node->slaveof->name,CLUSTER_NAMELEN); + else + ci = sdscatlen(ci,"-",1); + + unsigned long long nodeEpoch = node->configEpoch; + if (nodeIsSlave(node) && node->slaveof) { + nodeEpoch = node->slaveof->configEpoch; + } + /* Latency from the POV of this node, config epoch, link status */ + ci = sdscatfmt(ci," %I %I %U %s", + (long long) node->ping_sent, + (long long) node->pong_received, + nodeEpoch, + (node->link || node->flags & CLUSTER_NODE_MYSELF) ? + "connected" : "disconnected"); + + /* Slots served by this instance. If we already have slots info, + * append it directly, otherwise, generate slots only if it has. */ + if (node->slot_info_pairs) { + ci = representSlotInfo(ci, node->slot_info_pairs, node->slot_info_pairs_count); + } else if (node->numslots > 0) { + start = -1; + for (j = 0; j < CLUSTER_SLOTS; j++) { + int bit; + + if ((bit = clusterNodeCoversSlot(node, j)) != 0) { + if (start == -1) start = j; + } + if (start != -1 && (!bit || j == CLUSTER_SLOTS-1)) { + if (bit && j == CLUSTER_SLOTS-1) j++; + + if (start == j-1) { + ci = sdscatfmt(ci," %i",start); + } else { + ci = sdscatfmt(ci," %i-%i",start,j-1); + } + start = -1; + } + } + } + + /* Just for MYSELF node we also dump info about slots that + * we are migrating to other instances or importing from other + * instances. */ + if (node->flags & CLUSTER_NODE_MYSELF) { + for (j = 0; j < CLUSTER_SLOTS; j++) { + if (server.cluster->migrating_slots_to[j]) { + ci = sdscatprintf(ci," [%d->-%.40s]",j, + server.cluster->migrating_slots_to[j]->name); + } else if (server.cluster->importing_slots_from[j]) { + ci = sdscatprintf(ci," [%d-<-%.40s]",j, + server.cluster->importing_slots_from[j]->name); + } + } + } + return ci; +} + +/* Generate the slot topology for all nodes and store the string representation + * in the slots_info struct on the node. 
This is used to improve the efficiency + * of clusterGenNodesDescription() because it removes looping of the slot space + * for generating the slot info for each node individually. */ +void clusterGenNodesSlotsInfo(int filter) { + clusterNode *n = NULL; + int start = -1; + + for (int i = 0; i <= CLUSTER_SLOTS; i++) { + /* Find start node and slot id. */ + if (n == NULL) { + if (i == CLUSTER_SLOTS) break; + n = server.cluster->slots[i]; + start = i; + continue; + } + + /* Generate slots info when occur different node with start + * or end of slot. */ + if (i == CLUSTER_SLOTS || n != server.cluster->slots[i]) { + if (!(n->flags & filter)) { + if (!n->slot_info_pairs) { + n->slot_info_pairs = zmalloc(2 * n->numslots * sizeof(uint16_t)); + } + serverAssert((n->slot_info_pairs_count + 1) < (2 * n->numslots)); + n->slot_info_pairs[n->slot_info_pairs_count++] = start; + n->slot_info_pairs[n->slot_info_pairs_count++] = i-1; + } + if (i == CLUSTER_SLOTS) break; + n = server.cluster->slots[i]; + start = i; + } + } +} + +void clusterFreeNodesSlotsInfo(clusterNode *n) { + zfree(n->slot_info_pairs); + n->slot_info_pairs = NULL; + n->slot_info_pairs_count = 0; +} + +/* Generate a csv-alike representation of the nodes we are aware of, + * including the "myself" node, and return an SDS string containing the + * representation (it is up to the caller to free it). + * + * All the nodes matching at least one of the node flags specified in + * "filter" are excluded from the output, so using zero as a filter will + * include all the known nodes in the representation, including nodes in + * the HANDSHAKE state. + * + * Setting tls_primary to 1 to put TLS port in the main : + * field and put TCP port in aux field, instead of the opposite way. + * + * The representation obtained using this function is used for the output + * of the CLUSTER NODES function, and as format for the cluster + * configuration file (nodes.conf) for a given node. 
*/ +sds clusterGenNodesDescription(client *c, int filter, int tls_primary) { + sds ci = sdsempty(), ni; + dictIterator *di; + dictEntry *de; + + /* Generate all nodes slots info firstly. */ + clusterGenNodesSlotsInfo(filter); + + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + if (node->flags & filter) continue; + ni = clusterGenNodeDescription(c, node, tls_primary); + ci = sdscatsds(ci,ni); + sdsfree(ni); + ci = sdscatlen(ci,"\n",1); + + /* Release slots info. */ + clusterFreeNodesSlotsInfo(node); + } + dictReleaseIterator(di); + return ci; +} + +/* Add to the output buffer of the given client the description of the given cluster link. + * The description is a map with each entry being an attribute of the link. */ +void addReplyClusterLinkDescription(client *c, clusterLink *link) { + addReplyMapLen(c, 6); + + addReplyBulkCString(c, "direction"); + addReplyBulkCString(c, link->inbound ? "from" : "to"); + + /* addReplyClusterLinkDescription is only called for links that have been + * associated with nodes. The association is always bi-directional, so + * in addReplyClusterLinkDescription, link->node should never be NULL. 
*/ + serverAssert(link->node); + sds node_name = sdsnewlen(link->node->name, CLUSTER_NAMELEN); + addReplyBulkCString(c, "node"); + addReplyBulkCString(c, node_name); + sdsfree(node_name); + + addReplyBulkCString(c, "create-time"); + addReplyLongLong(c, link->ctime); + + char events[3], *p; + p = events; + if (link->conn) { + if (connHasReadHandler(link->conn)) *p++ = 'r'; + if (connHasWriteHandler(link->conn)) *p++ = 'w'; + } + *p = '\0'; + addReplyBulkCString(c, "events"); + addReplyBulkCString(c, events); + + addReplyBulkCString(c, "send-buffer-allocated"); + addReplyLongLong(c, link->send_msg_queue_mem); + + addReplyBulkCString(c, "send-buffer-used"); + addReplyLongLong(c, link->send_msg_queue_mem); +} + +/* Add to the output buffer of the given client an array of cluster link descriptions, + * with array entry being a description of a single current cluster link. */ +void addReplyClusterLinksDescription(client *c) { + dictIterator *di; + dictEntry *de; + void *arraylen_ptr = NULL; + int num_links = 0; + + arraylen_ptr = addReplyDeferredLen(c); + + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + if (node->link) { + num_links++; + addReplyClusterLinkDescription(c, node->link); + } + if (node->inbound_link) { + num_links++; + addReplyClusterLinkDescription(c, node->inbound_link); + } + } + dictReleaseIterator(di); + + setDeferredArrayLen(c, arraylen_ptr, num_links); +} + +/* ----------------------------------------------------------------------------- + * CLUSTER command + * -------------------------------------------------------------------------- */ + +const char *clusterGetMessageTypeString(int type) { + switch(type) { + case CLUSTERMSG_TYPE_PING: return "ping"; + case CLUSTERMSG_TYPE_PONG: return "pong"; + case CLUSTERMSG_TYPE_MEET: return "meet"; + case CLUSTERMSG_TYPE_FAIL: return "fail"; + case CLUSTERMSG_TYPE_PUBLISH: return "publish"; + case CLUSTERMSG_TYPE_PUBLISHSHARD: 
return "publishshard"; + case CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST: return "auth-req"; + case CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK: return "auth-ack"; + case CLUSTERMSG_TYPE_UPDATE: return "update"; + case CLUSTERMSG_TYPE_MFSTART: return "mfstart"; + case CLUSTERMSG_TYPE_MODULE: return "module"; + } + return "unknown"; +} + +int getSlotOrReply(client *c, robj *o) { + long long slot; + + if (getLongLongFromObject(o,&slot) != C_OK || + slot < 0 || slot >= CLUSTER_SLOTS) + { + addReplyError(c,"Invalid or out of range slot"); + return -1; + } + return (int) slot; +} + +int checkSlotAssignmentsOrReply(client *c, unsigned char *slots, int del, int start_slot, int end_slot) { + int slot; + for (slot = start_slot; slot <= end_slot; slot++) { + if (del && server.cluster->slots[slot] == NULL) { + addReplyErrorFormat(c,"Slot %d is already unassigned", slot); + return C_ERR; + } else if (!del && server.cluster->slots[slot]) { + addReplyErrorFormat(c,"Slot %d is already busy", slot); + return C_ERR; + } + if (slots[slot]++ == 1) { + addReplyErrorFormat(c,"Slot %d specified multiple times",(int)slot); + return C_ERR; + } + } + return C_OK; +} + +void clusterUpdateSlots(client *c, unsigned char *slots, int del) { + int j; + for (j = 0; j < CLUSTER_SLOTS; j++) { + if (slots[j]) { + int retval; + + /* If this slot was set as importing we can clear this + * state as now we are the real owner of the slot. */ + if (server.cluster->importing_slots_from[j]) + server.cluster->importing_slots_from[j] = NULL; + + retval = del ? clusterDelSlot(j) : + clusterAddSlot(myself,j); + serverAssertWithInfo(c,NULL,retval == C_OK); + } + } +} + +/* Add detailed information of a node to the output buffer of the given client. 
*/ +void addNodeDetailsToShardReply(client *c, clusterNode *node) { + int reply_count = 0; + void *node_replylen = addReplyDeferredLen(c); + addReplyBulkCString(c, "id"); + addReplyBulkCBuffer(c, node->name, CLUSTER_NAMELEN); + reply_count++; + + if (node->tcp_port) { + addReplyBulkCString(c, "port"); + addReplyLongLong(c, node->tcp_port); + reply_count++; + } + + if (node->tls_port) { + addReplyBulkCString(c, "tls-port"); + addReplyLongLong(c, node->tls_port); + reply_count++; + } + + addReplyBulkCString(c, "ip"); + addReplyBulkCString(c, node->ip); + reply_count++; + + addReplyBulkCString(c, "endpoint"); + addReplyBulkCString(c, clusterNodePreferredEndpoint(node)); + reply_count++; + + if (sdslen(node->hostname) != 0) { + addReplyBulkCString(c, "hostname"); + addReplyBulkCBuffer(c, node->hostname, sdslen(node->hostname)); + reply_count++; + } + + long long node_offset; + if (node->flags & CLUSTER_NODE_MYSELF) { + node_offset = nodeIsSlave(node) ? replicationGetSlaveOffset() : server.master_repl_offset; + } else { + node_offset = node->repl_offset; + } + + addReplyBulkCString(c, "role"); + addReplyBulkCString(c, nodeIsSlave(node) ? "replica" : "master"); + reply_count++; + + addReplyBulkCString(c, "replication-offset"); + addReplyLongLong(c, node_offset); + reply_count++; + + addReplyBulkCString(c, "health"); + const char *health_msg = NULL; + if (nodeFailed(node)) { + health_msg = "fail"; + } else if (nodeIsSlave(node) && node_offset == 0) { + health_msg = "loading"; + } else { + health_msg = "online"; + } + addReplyBulkCString(c, health_msg); + reply_count++; + + setDeferredMapLen(c, node_replylen, reply_count); +} + +/* Add the shard reply of a single shard based off the given primary node. 
*/ +void addShardReplyForClusterShards(client *c, list *nodes) { + serverAssert(listLength(nodes) > 0); + clusterNode *n = listNodeValue(listFirst(nodes)); + addReplyMapLen(c, 2); + addReplyBulkCString(c, "slots"); + + /* Use slot_info_pairs from the primary only */ + while (n->slaveof != NULL) n = n->slaveof; + + if (n->slot_info_pairs != NULL) { + serverAssert((n->slot_info_pairs_count % 2) == 0); + addReplyArrayLen(c, n->slot_info_pairs_count); + for (int i = 0; i < n->slot_info_pairs_count; i++) + addReplyLongLong(c, (unsigned long)n->slot_info_pairs[i]); + } else { + /* If no slot info pair is provided, the node owns no slots */ + addReplyArrayLen(c, 0); + } + + addReplyBulkCString(c, "nodes"); + addReplyArrayLen(c, listLength(nodes)); + listIter li; + listRewind(nodes, &li); + for (listNode *ln = listNext(&li); ln != NULL; ln = listNext(&li)) { + clusterNode *n = listNodeValue(ln); + addNodeDetailsToShardReply(c, n); + clusterFreeNodesSlotsInfo(n); + } +} + +/* Add to the output buffer of the given client, an array of slot (start, end) + * pair owned by the shard, also the primary and set of replica(s) along with + * information about each node. 
*/ +void clusterCommandShards(client *c) { + addReplyArrayLen(c, dictSize(server.cluster->shards)); + /* This call will add slot_info_pairs to all nodes */ + clusterGenNodesSlotsInfo(0); + dictIterator *di = dictGetSafeIterator(server.cluster->shards); + for(dictEntry *de = dictNext(di); de != NULL; de = dictNext(di)) { + addShardReplyForClusterShards(c, dictGetVal(de)); + } + dictReleaseIterator(di); +} + +sds genClusterInfoString(void) { + sds info = sdsempty(); + char *statestr[] = {"ok","fail"}; + int slots_assigned = 0, slots_ok = 0, slots_pfail = 0, slots_fail = 0; + uint64_t myepoch; + int j; + + for (j = 0; j < CLUSTER_SLOTS; j++) { + clusterNode *n = server.cluster->slots[j]; + + if (n == NULL) continue; + slots_assigned++; + if (nodeFailed(n)) { + slots_fail++; + } else if (nodeTimedOut(n)) { + slots_pfail++; + } else { + slots_ok++; + } + } + + myepoch = (nodeIsSlave(myself) && myself->slaveof) ? + myself->slaveof->configEpoch : myself->configEpoch; + + info = sdscatprintf(info, + "cluster_state:%s\r\n" + "cluster_slots_assigned:%d\r\n" + "cluster_slots_ok:%d\r\n" + "cluster_slots_pfail:%d\r\n" + "cluster_slots_fail:%d\r\n" + "cluster_known_nodes:%lu\r\n" + "cluster_size:%d\r\n" + "cluster_current_epoch:%llu\r\n" + "cluster_my_epoch:%llu\r\n" + , statestr[server.cluster->state], + slots_assigned, + slots_ok, + slots_pfail, + slots_fail, + dictSize(server.cluster->nodes), + server.cluster->size, + (unsigned long long) server.cluster->currentEpoch, + (unsigned long long) myepoch + ); + + /* Show stats about messages sent and received. 
*/ + long long tot_msg_sent = 0; + long long tot_msg_received = 0; + + for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) { + if (server.cluster->stats_bus_messages_sent[i] == 0) continue; + tot_msg_sent += server.cluster->stats_bus_messages_sent[i]; + info = sdscatprintf(info, + "cluster_stats_messages_%s_sent:%lld\r\n", + clusterGetMessageTypeString(i), + server.cluster->stats_bus_messages_sent[i]); + } + info = sdscatprintf(info, + "cluster_stats_messages_sent:%lld\r\n", tot_msg_sent); + + for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) { + if (server.cluster->stats_bus_messages_received[i] == 0) continue; + tot_msg_received += server.cluster->stats_bus_messages_received[i]; + info = sdscatprintf(info, + "cluster_stats_messages_%s_received:%lld\r\n", + clusterGetMessageTypeString(i), + server.cluster->stats_bus_messages_received[i]); + } + info = sdscatprintf(info, + "cluster_stats_messages_received:%lld\r\n", tot_msg_received); + + info = sdscatprintf(info, + "total_cluster_links_buffer_limit_exceeded:%llu\r\n", + server.cluster->stat_cluster_links_buffer_limit_exceeded); + + return info; +} + + +void removeChannelsInSlot(unsigned int slot) { + unsigned int channelcount = countChannelsInSlot(slot); + if (channelcount == 0) return; + + /* Retrieve all the channels for the slot. */ + robj **channels = zmalloc(sizeof(robj*)*channelcount); + raxIterator iter; + int j = 0; + unsigned char indexed[2]; + + indexed[0] = (slot >> 8) & 0xff; + indexed[1] = slot & 0xff; + raxStart(&iter,server.cluster->slots_to_channels); + raxSeek(&iter,">=",indexed,2); + while(raxNext(&iter)) { + if (iter.key[0] != indexed[0] || iter.key[1] != indexed[1]) break; + channels[j++] = createStringObject((char*)iter.key + 2, iter.key_len - 2); + } + raxStop(&iter); + + pubsubUnsubscribeShardChannels(channels, channelcount); + zfree(channels); +} + + + +/* Remove all the keys in the specified hash slot. + * The number of removed items is returned. 
*/ +unsigned int delKeysInSlot(unsigned int hashslot) { + unsigned int j = 0; + + dictIterator *iter = NULL; + dictEntry *de = NULL; + iter = dictGetSafeIterator(server.db->dict[hashslot]); + while((de = dictNext(iter)) != NULL) { + sds sdskey = dictGetKey(de); + robj *key = createStringObject(sdskey, sdslen(sdskey)); + dbDelete(&server.db[0], key); + propagateDeletion(&server.db[0], key, server.lazyfree_lazy_server_del); + signalModifiedKey(NULL, &server.db[0], key); + /* The keys are not actually logically deleted from the database, just moved to another node. + * The modules needs to know that these keys are no longer available locally, so just send the + * keyspace notification to the modules, but not to clients. */ + moduleNotifyKeyspaceEvent(NOTIFY_GENERIC, "del", key, server.db[0].id); + postExecutionUnitOperations(); + decrRefCount(key); + j++; + server.dirty++; + } + dictReleaseIterator(iter); + + return j; +} + +/* ----------------------------------------------------------------------------- + * Operation(s) on channel rax tree. + * -------------------------------------------------------------------------- */ + +void slotToChannelUpdate(sds channel, int add) { + size_t keylen = sdslen(channel); + unsigned int hashslot = keyHashSlot(channel,keylen); + unsigned char buf[64]; + unsigned char *indexed = buf; + + if (keylen+2 > 64) indexed = zmalloc(keylen+2); + indexed[0] = (hashslot >> 8) & 0xff; + indexed[1] = hashslot & 0xff; + memcpy(indexed+2,channel,keylen); + if (add) { + raxInsert(server.cluster->slots_to_channels,indexed,keylen+2,NULL,NULL); + } else { + raxRemove(server.cluster->slots_to_channels,indexed,keylen+2,NULL); + } + if (indexed != buf) zfree(indexed); +} + +void slotToChannelAdd(sds channel) { + slotToChannelUpdate(channel,1); +} + +void slotToChannelDel(sds channel) { + slotToChannelUpdate(channel,0); +} + +/* Get the count of the channels for a given slot. 
*/ +unsigned int countChannelsInSlot(unsigned int hashslot) { + raxIterator iter; + int j = 0; + unsigned char indexed[2]; + + indexed[0] = (hashslot >> 8) & 0xff; + indexed[1] = hashslot & 0xff; + raxStart(&iter,server.cluster->slots_to_channels); + raxSeek(&iter,">=",indexed,2); + while(raxNext(&iter)) { + if (iter.key[0] != indexed[0] || iter.key[1] != indexed[1]) break; + j++; + } + raxStop(&iter); + return j; +} + +int clusterNodeIsMyself(clusterNode *n) { + return n == server.cluster->myself; +} + +clusterNode *getMyClusterNode(void) { + return server.cluster->myself; +} + +int clusterManualFailoverTimeLimit(void) { + return server.cluster->mf_end; +} + +int getClusterSize(void) { + return dictSize(server.cluster->nodes); +} + +char **getClusterNodesList(size_t *numnodes) { + size_t count = dictSize(server.cluster->nodes); + char **ids = zmalloc((count+1)*CLUSTER_NAMELEN); + dictIterator *di = dictGetIterator(server.cluster->nodes); + dictEntry *de; + int j = 0; + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + if (node->flags & (CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE)) continue; + ids[j] = zmalloc(CLUSTER_NAMELEN); + memcpy(ids[j],node->name,CLUSTER_NAMELEN); + j++; + } + *numnodes = j; + ids[j] = NULL; /* Null term so that FreeClusterNodesList does not need + * to also get the count argument. */ + dictReleaseIterator(di); + return ids; +} + +int clusterNodeIsMaster(clusterNode *n) { + return n->flags & CLUSTER_NODE_MASTER; +} + +int handleDebugClusterCommand(client *c) { + if (strcasecmp(c->argv[1]->ptr, "CLUSTERLINK") || + strcasecmp(c->argv[2]->ptr, "KILL") || + c->argc != 5) { + return 0; + } + + if (!server.cluster_enabled) { + addReplyError(c, "Debug option only available for cluster mode enabled setup!"); + return 1; + } + + /* Find the node. 
*/ + clusterNode *n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); + if (!n) { + addReplyErrorFormat(c, "Unknown node %s", (char *) c->argv[4]->ptr); + return 1; + } + + /* Terminate the link based on the direction or all. */ + if (!strcasecmp(c->argv[3]->ptr, "from")) { + freeClusterLink(n->inbound_link); + } else if (!strcasecmp(c->argv[3]->ptr, "to")) { + freeClusterLink(n->link); + } else if (!strcasecmp(c->argv[3]->ptr, "all")) { + freeClusterLink(n->link); + freeClusterLink(n->inbound_link); + } else { + addReplyErrorFormat(c, "Unknown direction %s", (char *) c->argv[3]->ptr); + } + addReply(c, shared.ok); + + return 1; +} + +int clusterNodePending(clusterNode *node) { + return node->flags & (CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE); +} + +char *clusterNodeIp(clusterNode *node) { + return node->ip; +} + +int clusterNodeIsSlave(clusterNode *node) { + return node->flags & CLUSTER_NODE_SLAVE; +} + +clusterNode *clusterNodeGetSlaveof(clusterNode *node) { + return node->slaveof; +} + +char *clusterNodeGetName(clusterNode *node) { + return node->name; +} + +int clusterNodeTimedOut(clusterNode *node) { + return nodeTimedOut(node); +} + +int clusterNodeIsFailing(clusterNode *node) { + return nodeFailed(node); +} + +int clusterNodeIsNoFailover(clusterNode *node) { + return node->flags & CLUSTER_NODE_NOFAILOVER; +} + +const char **clusterDebugCommandExtendedHelp(void) { + static const char *help[] = { + "CLUSTERLINK KILL ", + " Kills the link based on the direction to/from (both) with the provided node.", + NULL + }; + + return help; +} + +char *clusterNodeGetShardId(clusterNode *node) { + return node->shard_id; +} + +int clusterCommandSpecial(client *c) { + if (!strcasecmp(c->argv[1]->ptr,"meet") && (c->argc == 4 || c->argc == 5)) { + /* CLUSTER MEET [cport] */ + long long port, cport; + + if (getLongLongFromObject(c->argv[3], &port) != C_OK) { + addReplyErrorFormat(c,"Invalid base port specified: %s", + (char*)c->argv[3]->ptr); + return 1; + } + + 
if (c->argc == 5) { + if (getLongLongFromObject(c->argv[4], &cport) != C_OK) { + addReplyErrorFormat(c,"Invalid bus port specified: %s", + (char*)c->argv[4]->ptr); + return 1; + } + } else { + cport = port + CLUSTER_PORT_INCR; + } + + if (clusterStartHandshake(c->argv[2]->ptr,port,cport) == 0 && + errno == EINVAL) + { + addReplyErrorFormat(c,"Invalid node address specified: %s:%s", + (char*)c->argv[2]->ptr, (char*)c->argv[3]->ptr); + } else { + addReply(c,shared.ok); + } + } else if (!strcasecmp(c->argv[1]->ptr,"flushslots") && c->argc == 2) { + /* CLUSTER FLUSHSLOTS */ + if (dbSize(&server.db[0], DB_MAIN) != 0) { + addReplyError(c,"DB must be empty to perform CLUSTER FLUSHSLOTS."); + return 1; + } + clusterDelNodeSlots(myself); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } else if ((!strcasecmp(c->argv[1]->ptr,"addslots") || + !strcasecmp(c->argv[1]->ptr,"delslots")) && c->argc >= 3) { + /* CLUSTER ADDSLOTS [slot] ... */ + /* CLUSTER DELSLOTS [slot] ... */ + int j, slot; + unsigned char *slots = zmalloc(CLUSTER_SLOTS); + int del = !strcasecmp(c->argv[1]->ptr,"delslots"); + + memset(slots,0,CLUSTER_SLOTS); + /* Check that all the arguments are parseable.*/ + for (j = 2; j < c->argc; j++) { + if ((slot = getSlotOrReply(c,c->argv[j])) == C_ERR) { + zfree(slots); + return 1; + } + } + /* Check that the slots are not already busy. */ + for (j = 2; j < c->argc; j++) { + slot = getSlotOrReply(c,c->argv[j]); + if (checkSlotAssignmentsOrReply(c, slots, del, slot, slot) == C_ERR) { + zfree(slots); + return 1; + } + } + clusterUpdateSlots(c, slots, del); + zfree(slots); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } else if ((!strcasecmp(c->argv[1]->ptr,"addslotsrange") || + !strcasecmp(c->argv[1]->ptr,"delslotsrange")) && c->argc >= 4) { + if (c->argc % 2 == 1) { + addReplyErrorArity(c); + return 1; + } + /* CLUSTER ADDSLOTSRANGE [ ...] 
*/ + /* CLUSTER DELSLOTSRANGE [ ...] */ + int j, startslot, endslot; + unsigned char *slots = zmalloc(CLUSTER_SLOTS); + int del = !strcasecmp(c->argv[1]->ptr,"delslotsrange"); + + memset(slots,0,CLUSTER_SLOTS); + /* Check that all the arguments are parseable and that all the + * slots are not already busy. */ + for (j = 2; j < c->argc; j += 2) { + if ((startslot = getSlotOrReply(c,c->argv[j])) == C_ERR) { + zfree(slots); + return 1; + } + if ((endslot = getSlotOrReply(c,c->argv[j+1])) == C_ERR) { + zfree(slots); + return 1; + } + if (startslot > endslot) { + addReplyErrorFormat(c,"start slot number %d is greater than end slot number %d", startslot, endslot); + zfree(slots); + return 1; + } + + if (checkSlotAssignmentsOrReply(c, slots, del, startslot, endslot) == C_ERR) { + zfree(slots); + return 1; + } + } + clusterUpdateSlots(c, slots, del); + zfree(slots); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"setslot") && c->argc >= 4) { + /* SETSLOT 10 MIGRATING */ + /* SETSLOT 10 IMPORTING */ + /* SETSLOT 10 STABLE */ + /* SETSLOT 10 NODE */ + int slot; + clusterNode *n; + + if (nodeIsSlave(myself)) { + addReplyError(c,"Please use SETSLOT only with masters."); + return 1; + } + + if ((slot = getSlotOrReply(c, c->argv[2])) == -1) return 1; + + if (!strcasecmp(c->argv[3]->ptr,"migrating") && c->argc == 5) { + if (server.cluster->slots[slot] != myself) { + addReplyErrorFormat(c,"I'm not the owner of hash slot %u",slot); + return 1; + } + n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); + if (n == NULL) { + addReplyErrorFormat(c,"I don't know about node %s", + (char*)c->argv[4]->ptr); + return 1; + } + if (nodeIsSlave(n)) { + addReplyError(c,"Target node is not a master"); + return 1; + } + server.cluster->migrating_slots_to[slot] = n; + } else if (!strcasecmp(c->argv[3]->ptr,"importing") && c->argc == 5) { + if (server.cluster->slots[slot] == myself) { + 
addReplyErrorFormat(c, + "I'm already the owner of hash slot %u",slot); + return 1; + } + n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); + if (n == NULL) { + addReplyErrorFormat(c,"I don't know about node %s", + (char*)c->argv[4]->ptr); + return 1; + } + if (nodeIsSlave(n)) { + addReplyError(c,"Target node is not a master"); + return 1; + } + server.cluster->importing_slots_from[slot] = n; + } else if (!strcasecmp(c->argv[3]->ptr,"stable") && c->argc == 4) { + /* CLUSTER SETSLOT STABLE */ + server.cluster->importing_slots_from[slot] = NULL; + server.cluster->migrating_slots_to[slot] = NULL; + } else if (!strcasecmp(c->argv[3]->ptr,"node") && c->argc == 5) { + /* CLUSTER SETSLOT NODE */ + n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); + if (!n) { + addReplyErrorFormat(c,"Unknown node %s", + (char*)c->argv[4]->ptr); + return 1; + } + if (nodeIsSlave(n)) { + addReplyError(c,"Target node is not a master"); + return 1; + } + /* If this hash slot was served by 'myself' before to switch + * make sure there are no longer local keys for this hash slot. */ + if (server.cluster->slots[slot] == myself && n != myself) { + if (countKeysInSlot(slot) != 0) { + addReplyErrorFormat(c, + "Can't assign hashslot %d to a different node " + "while I still hold keys for this hash slot.", slot); + return 1; + } + } + /* If this slot is in migrating status but we have no keys + * for it assigning the slot to another node will clear + * the migrating status. */ + if (countKeysInSlot(slot) == 0 && + server.cluster->migrating_slots_to[slot]) + server.cluster->migrating_slots_to[slot] = NULL; + + int slot_was_mine = server.cluster->slots[slot] == myself; + clusterDelSlot(slot); + clusterAddSlot(n,slot); + + /* If we are a master left without slots, we should turn into a + * replica of the new master. 
*/ + if (slot_was_mine && + n != myself && + myself->numslots == 0 && + server.cluster_allow_replica_migration) { + serverLog(LL_NOTICE, + "Configuration change detected. Reconfiguring myself " + "as a replica of %.40s (%s)", n->name, n->human_nodename); + clusterSetMaster(n); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | + CLUSTER_TODO_UPDATE_STATE | + CLUSTER_TODO_FSYNC_CONFIG); + } + + /* If this node was importing this slot, assigning the slot to + * itself also clears the importing status. */ + if (n == myself && + server.cluster->importing_slots_from[slot]) { + /* This slot was manually migrated, set this node configEpoch + * to a new epoch so that the new version can be propagated + * by the cluster. + * + * Note that if this ever results in a collision with another + * node getting the same configEpoch, for example because a + * failover happens at the same time we close the slot, the + * configEpoch collision resolution will fix it assigning + * a different epoch to each node. */ + if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { + serverLog(LL_NOTICE, + "configEpoch updated after importing slot %d", slot); + } + server.cluster->importing_slots_from[slot] = NULL; + /* After importing this slot, let the other nodes know as + * soon as possible. */ + clusterBroadcastPong(CLUSTER_BROADCAST_ALL); + } + } else { + addReplyError(c, + "Invalid CLUSTER SETSLOT action or number of arguments. Try CLUSTER HELP"); + return 1; + } + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_UPDATE_STATE); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"bumpepoch") && c->argc == 2) { + /* CLUSTER BUMPEPOCH */ + int retval = clusterBumpConfigEpochWithoutConsensus(); + sds reply = sdscatprintf(sdsempty(),"+%s %llu\r\n", + (retval == C_OK) ? 
"BUMPED" : "STILL", + (unsigned long long) myself->configEpoch); + addReplySds(c,reply); + } else if (!strcasecmp(c->argv[1]->ptr,"saveconfig") && c->argc == 2) { + int retval = clusterSaveConfig(1); + + if (retval == 0) + addReply(c,shared.ok); + else + addReplyErrorFormat(c,"error saving the cluster node config: %s", + strerror(errno)); + } else if (!strcasecmp(c->argv[1]->ptr,"forget") && c->argc == 3) { + /* CLUSTER FORGET */ + clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); + if (!n) { + if (clusterBlacklistExists((char*)c->argv[2]->ptr)) + /* Already forgotten. The deletion may have been gossipped by + * another node, so we pretend it succeeded. */ + addReply(c,shared.ok); + else + addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); + return 1; + } else if (n == myself) { + addReplyError(c,"I tried hard but I can't forget myself..."); + return 1; + } else if (nodeIsSlave(myself) && myself->slaveof == n) { + addReplyError(c,"Can't forget my master!"); + return 1; + } + clusterBlacklistAddNode(n); + clusterDelNode(n); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"replicate") && c->argc == 3) { + /* CLUSTER REPLICATE */ + /* Lookup the specified node in our table. */ + clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); + if (!n) { + addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); + return 1; + } + + /* I can't replicate myself. */ + if (n == myself) { + addReplyError(c,"Can't replicate myself"); + return 1; + } + + /* Can't replicate a slave. */ + if (nodeIsSlave(n)) { + addReplyError(c,"I can only replicate a master, not a replica."); + return 1; + } + + /* If the instance is currently a master, it should have no assigned + * slots nor keys to accept to replicate some other node. + * Slaves can switch to another master without issues. 
*/ + if (clusterNodeIsMaster(myself) && + (myself->numslots != 0 || dbSize(&server.db[0], DB_MAIN) != 0)) { + addReplyError(c, + "To set a master the node must be empty and " + "without assigned slots."); + return 1; + } + + /* Set the master. */ + clusterSetMaster(n); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"count-failure-reports") && + c->argc == 3) + { + /* CLUSTER COUNT-FAILURE-REPORTS */ + clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); + + if (!n) { + addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); + return 1; + } else { + addReplyLongLong(c,clusterNodeFailureReportsCount(n)); + } + } else if (!strcasecmp(c->argv[1]->ptr,"failover") && + (c->argc == 2 || c->argc == 3)) + { + /* CLUSTER FAILOVER [FORCE|TAKEOVER] */ + int force = 0, takeover = 0; + + if (c->argc == 3) { + if (!strcasecmp(c->argv[2]->ptr,"force")) { + force = 1; + } else if (!strcasecmp(c->argv[2]->ptr,"takeover")) { + takeover = 1; + force = 1; /* Takeover also implies force. */ + } else { + addReplyErrorObject(c,shared.syntaxerr); + return 1; + } + } + + /* Check preconditions. */ + if (clusterNodeIsMaster(myself)) { + addReplyError(c,"You should send CLUSTER FAILOVER to a replica"); + return 1; + } else if (myself->slaveof == NULL) { + addReplyError(c,"I'm a replica but my master is unknown to me"); + return 1; + } else if (!force && + (nodeFailed(myself->slaveof) || + myself->slaveof->link == NULL)) + { + addReplyError(c,"Master is down or failed, " + "please use CLUSTER FAILOVER FORCE"); + return 1; + } + resetManualFailover(); + server.cluster->mf_end = mstime() + CLUSTER_MF_TIMEOUT; + + if (takeover) { + /* A takeover does not perform any initial check. It just + * generates a new configuration epoch for this node without + * consensus, claims the master's slots, and broadcast the new + * configuration. 
*/ + serverLog(LL_NOTICE,"Taking over the master (user request)."); + clusterBumpConfigEpochWithoutConsensus(); + clusterFailoverReplaceYourMaster(); + } else if (force) { + /* If this is a forced failover, we don't need to talk with our + * master to agree about the offset. We just failover taking over + * it without coordination. */ + serverLog(LL_NOTICE,"Forced failover user request accepted."); + server.cluster->mf_can_start = 1; + } else { + serverLog(LL_NOTICE,"Manual failover user request accepted."); + clusterSendMFStart(myself->slaveof); + } + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"set-config-epoch") && c->argc == 3) + { + /* CLUSTER SET-CONFIG-EPOCH + * + * The user is allowed to set the config epoch only when a node is + * totally fresh: no config epoch, no other known node, and so forth. + * This happens at cluster creation time to start with a cluster where + * every node has a different node ID, without to rely on the conflicts + * resolution system which is too slow when a big cluster is created. */ + long long epoch; + + if (getLongLongFromObjectOrReply(c,c->argv[2],&epoch,NULL) != C_OK) + return 1; + + if (epoch < 0) { + addReplyErrorFormat(c,"Invalid config epoch specified: %lld",epoch); + } else if (dictSize(server.cluster->nodes) > 1) { + addReplyError(c,"The user can assign a config epoch only when the " + "node does not know any other node."); + } else if (myself->configEpoch != 0) { + addReplyError(c,"Node config epoch is already non-zero"); + } else { + myself->configEpoch = epoch; + serverLog(LL_NOTICE, + "configEpoch set to %llu via CLUSTER SET-CONFIG-EPOCH", + (unsigned long long) myself->configEpoch); + + if (server.cluster->currentEpoch < (uint64_t)epoch) + server.cluster->currentEpoch = epoch; + /* No need to fsync the config here since in the unlucky event + * of a failure to persist the config, the conflict resolution code + * will assign a unique config to this node. 
*/ + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } + } else if (!strcasecmp(c->argv[1]->ptr,"reset") && + (c->argc == 2 || c->argc == 3)) + { + /* CLUSTER RESET [SOFT|HARD] */ + int hard = 0; + + /* Parse soft/hard argument. Default is soft. */ + if (c->argc == 3) { + if (!strcasecmp(c->argv[2]->ptr,"hard")) { + hard = 1; + } else if (!strcasecmp(c->argv[2]->ptr,"soft")) { + hard = 0; + } else { + addReplyErrorObject(c,shared.syntaxerr); + return 1; + } + } + + /* Slaves can be reset while containing data, but not master nodes + * that must be empty. */ + if (clusterNodeIsMaster(myself) && dbSize(c->db, DB_MAIN) != 0) { + addReplyError(c,"CLUSTER RESET can't be called with " + "master nodes containing keys"); + return 1; + } + clusterReset(hard); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"links") && c->argc == 2) { + /* CLUSTER LINKS */ + addReplyClusterLinksDescription(c); + } else { + return 0; + } + + return 1; +} + +const char **clusterCommandExtendedHelp(void) { + static const char *help[] = { + "ADDSLOTS [ ...]", + " Assign slots to current node.", + "ADDSLOTSRANGE [ ...]", + " Assign slots which are between and to current node.", + "BUMPEPOCH", + " Advance the cluster config epoch.", + "COUNT-FAILURE-REPORTS ", + " Return number of failure reports for .", + "DELSLOTS [ ...]", + " Delete slots information from current node.", + "DELSLOTSRANGE [ ...]", + " Delete slots information which are between and from current node.", + "FAILOVER [FORCE|TAKEOVER]", + " Promote current replica node to being a master.", + "FORGET ", + " Remove a node from the cluster.", + "FLUSHSLOTS", + " Delete current node own slots information.", + "MEET []", + " Connect nodes into a working cluster.", + "REPLICATE ", + " Configure current node as replica to .", + "RESET [HARD|SOFT]", + " Reset current node (default: soft).", + "SET-CONFIG-EPOCH ", + " Set config epoch of current node.", + "SETSLOT 
(IMPORTING |MIGRATING |STABLE|NODE )", + " Set slot state.", + "SAVECONFIG", + " Force saving cluster configuration on disk.", + "LINKS", + " Return information about all network links between this node and its peers.", + " Output format is an array where each array element is a map containing attributes of a link", + NULL + }; + + return help; +} + +int clusterNodeNumSlaves(clusterNode *node) { + return node->numslaves; +} + +clusterNode *clusterNodeGetSlave(clusterNode *node, int slave_idx) { + return node->slaves[slave_idx]; +} + +clusterNode *getMigratingSlotDest(int slot) { + return server.cluster->migrating_slots_to[slot]; +} + +clusterNode *getImportingSlotSource(int slot) { + return server.cluster->importing_slots_from[slot]; +} + +int isClusterHealthy(void) { + return server.cluster->state == CLUSTER_OK; +} + +clusterNode *getNodeBySlot(int slot) { + return server.cluster->slots[slot]; +} + +char *clusterNodeHostname(clusterNode *node) { + return node->hostname; +} + +long long clusterNodeReplOffset(clusterNode *node) { + return node->repl_offset; +} + +const char *clusterNodePreferredEndpoint(clusterNode *n) { + char *hostname = clusterNodeHostname(n); + switch (server.cluster_preferred_endpoint_type) { + case CLUSTER_ENDPOINT_TYPE_IP: + return clusterNodeIp(n); + case CLUSTER_ENDPOINT_TYPE_HOSTNAME: + return (hostname != NULL && hostname[0] != '\0') ? hostname : "?"; + case CLUSTER_ENDPOINT_TYPE_UNKNOWN_ENDPOINT: + return ""; + } + return "unknown"; +} + +int clusterAllowFailoverCmd(client *c) { + if (!server.cluster_enabled) { + return 1; + } + addReplyError(c,"FAILOVER not allowed in cluster mode. 
" + "Use CLUSTER FAILOVER command instead."); + return 0; +} + +void clusterPromoteSelfToMaster(void) { + replicationUnsetMaster(); +} diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h new file mode 100644 index 000000000..578b46fc3 --- /dev/null +++ b/src/cluster_legacy.h @@ -0,0 +1,360 @@ +#ifndef CLUSTER_LEGACY_H +#define CLUSTER_LEGACY_H + +#define CLUSTER_PORT_INCR 10000 /* Cluster port = baseport + PORT_INCR */ + +/* The following defines are amount of time, sometimes expressed as + * multiplicators of the node timeout value (when ending with MULT). */ +#define CLUSTER_FAIL_REPORT_VALIDITY_MULT 2 /* Fail report validity. */ +#define CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */ +#define CLUSTER_MF_TIMEOUT 5000 /* Milliseconds to do a manual failover. */ +#define CLUSTER_MF_PAUSE_MULT 2 /* Master pause manual failover mult. */ +#define CLUSTER_SLAVE_MIGRATION_DELAY 5000 /* Delay for slave migration. */ + +/* Reasons why a slave is not able to failover. */ +#define CLUSTER_CANT_FAILOVER_NONE 0 +#define CLUSTER_CANT_FAILOVER_DATA_AGE 1 +#define CLUSTER_CANT_FAILOVER_WAITING_DELAY 2 +#define CLUSTER_CANT_FAILOVER_EXPIRED 3 +#define CLUSTER_CANT_FAILOVER_WAITING_VOTES 4 +#define CLUSTER_CANT_FAILOVER_RELOG_PERIOD (10) /* seconds. */ + +/* clusterState todo_before_sleep flags. */ +#define CLUSTER_TODO_HANDLE_FAILOVER (1<<0) +#define CLUSTER_TODO_UPDATE_STATE (1<<1) +#define CLUSTER_TODO_SAVE_CONFIG (1<<2) +#define CLUSTER_TODO_FSYNC_CONFIG (1<<3) +#define CLUSTER_TODO_HANDLE_MANUALFAILOVER (1<<4) + +/* clusterLink encapsulates everything needed to talk with a remote node. 
*/ +typedef struct clusterLink { + mstime_t ctime; /* Link creation time */ + connection *conn; /* Connection to remote node */ + list *send_msg_queue; /* List of messages to be sent */ + size_t head_msg_send_offset; /* Number of bytes already sent of message at head of queue */ + unsigned long long send_msg_queue_mem; /* Memory in bytes used by message queue */ + char *rcvbuf; /* Packet reception buffer */ + size_t rcvbuf_len; /* Used size of rcvbuf */ + size_t rcvbuf_alloc; /* Allocated size of rcvbuf */ + clusterNode *node; /* Node related to this link. Initialized to NULL when unknown */ + int inbound; /* 1 if this link is an inbound link accepted from the related node */ +} clusterLink; + +/* Cluster node flags and macros. */ +#define CLUSTER_NODE_MASTER 1 /* The node is a master */ +#define CLUSTER_NODE_SLAVE 2 /* The node is a slave */ +#define CLUSTER_NODE_PFAIL 4 /* Failure? Need acknowledge */ +#define CLUSTER_NODE_FAIL 8 /* The node is believed to be malfunctioning */ +#define CLUSTER_NODE_MYSELF 16 /* This node is myself */ +#define CLUSTER_NODE_HANDSHAKE 32 /* We have still to exchange the first ping */ +#define CLUSTER_NODE_NOADDR 64 /* We don't know the address of this node */ +#define CLUSTER_NODE_MEET 128 /* Send a MEET message to this node */ +#define CLUSTER_NODE_MIGRATE_TO 256 /* Master eligible for replica migration. */ +#define CLUSTER_NODE_NOFAILOVER 512 /* Slave will not try to failover. 
*/ +#define CLUSTER_NODE_NULL_NAME "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" + +#define nodeIsSlave(n) ((n)->flags & CLUSTER_NODE_SLAVE) +#define nodeInHandshake(n) ((n)->flags & CLUSTER_NODE_HANDSHAKE) +#define nodeHasAddr(n) (!((n)->flags & CLUSTER_NODE_NOADDR)) +#define nodeTimedOut(n) ((n)->flags & CLUSTER_NODE_PFAIL) +#define nodeFailed(n) ((n)->flags & CLUSTER_NODE_FAIL) +#define nodeCantFailover(n) ((n)->flags & CLUSTER_NODE_NOFAILOVER) + +/* This structure represent elements of node->fail_reports. */ +typedef struct clusterNodeFailReport { + clusterNode *node; /* Node reporting the failure condition. */ + mstime_t time; /* Time of the last report from this node. */ +} clusterNodeFailReport; + +/* Redis cluster messages header */ + +/* Message types. + * + * Note that the PING, PONG and MEET messages are actually the same exact + * kind of packet. PONG is the reply to ping, in the exact format as a PING, + * while MEET is a special PING that forces the receiver to add the sender + * as a node (if it is not already in the list). */ +#define CLUSTERMSG_TYPE_PING 0 /* Ping */ +#define CLUSTERMSG_TYPE_PONG 1 /* Pong (reply to Ping) */ +#define CLUSTERMSG_TYPE_MEET 2 /* Meet "let's join" message */ +#define CLUSTERMSG_TYPE_FAIL 3 /* Mark node xxx as failing */ +#define CLUSTERMSG_TYPE_PUBLISH 4 /* Pub/Sub Publish propagation */ +#define CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST 5 /* May I failover? */ +#define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6 /* Yes, you have my vote */ +#define CLUSTERMSG_TYPE_UPDATE 7 /* Another node slots configuration */ +#define CLUSTERMSG_TYPE_MFSTART 8 /* Pause clients for manual failover */ +#define CLUSTERMSG_TYPE_MODULE 9 /* Module cluster API message. */ +#define CLUSTERMSG_TYPE_PUBLISHSHARD 10 /* Pub/Sub Publish shard propagation */ +#define CLUSTERMSG_TYPE_COUNT 11 /* Total number of message types. 
*/ + +/* Initially we don't know our "name", but we'll find it once we connect + * to the first node, using the getsockname() function. Then we'll use this + * address for all the next messages. */ +typedef struct { + char nodename[CLUSTER_NAMELEN]; + uint32_t ping_sent; + uint32_t pong_received; + char ip[NET_IP_STR_LEN]; /* IP address last time it was seen */ + uint16_t port; /* primary port last time it was seen */ + uint16_t cport; /* cluster port last time it was seen */ + uint16_t flags; /* node->flags copy */ + uint16_t pport; /* secondary port last time it was seen */ + uint16_t notused1; +} clusterMsgDataGossip; + +typedef struct { + char nodename[CLUSTER_NAMELEN]; +} clusterMsgDataFail; + +typedef struct { + uint32_t channel_len; + uint32_t message_len; + unsigned char bulk_data[8]; /* 8 bytes just as placeholder. */ +} clusterMsgDataPublish; + +typedef struct { + uint64_t configEpoch; /* Config epoch of the specified instance. */ + char nodename[CLUSTER_NAMELEN]; /* Name of the slots owner. */ + unsigned char slots[CLUSTER_SLOTS/8]; /* Slots bitmap. */ +} clusterMsgDataUpdate; + +typedef struct { + uint64_t module_id; /* ID of the sender module. */ + uint32_t len; /* ID of the sender module. */ + uint8_t type; /* Type from 0 to 255. */ + unsigned char bulk_data[3]; /* 3 bytes just as placeholder. */ +} clusterMsgModule; + +/* The cluster supports optional extension messages that can be sent + * along with ping/pong/meet messages to give additional info in a + * consistent manner. */ +typedef enum { + CLUSTERMSG_EXT_TYPE_HOSTNAME, + CLUSTERMSG_EXT_TYPE_HUMAN_NODENAME, + CLUSTERMSG_EXT_TYPE_FORGOTTEN_NODE, + CLUSTERMSG_EXT_TYPE_SHARDID, +} clusterMsgPingtypes; + +/* Helper function for making sure extensions are eight byte aligned. */ +#define EIGHT_BYTE_ALIGN(size) ((((size) + 7) / 8) * 8) + +typedef struct { + char hostname[1]; /* The announced hostname, ends with \0. 
*/ +} clusterMsgPingExtHostname; + +typedef struct { + char human_nodename[1]; /* The announced nodename, ends with \0. */ +} clusterMsgPingExtHumanNodename; + +typedef struct { + char name[CLUSTER_NAMELEN]; /* Node name. */ + uint64_t ttl; /* Remaining time to blacklist the node, in seconds. */ +} clusterMsgPingExtForgottenNode; + +static_assert(sizeof(clusterMsgPingExtForgottenNode) % 8 == 0, ""); + +typedef struct { + char shard_id[CLUSTER_NAMELEN]; /* The shard_id, 40 bytes fixed. */ +} clusterMsgPingExtShardId; + +typedef struct { + uint32_t length; /* Total length of this extension message (including this header) */ + uint16_t type; /* Type of this extension message (see clusterMsgPingExtTypes) */ + uint16_t unused; /* 16 bits of padding to make this structure 8 byte aligned. */ + union { + clusterMsgPingExtHostname hostname; + clusterMsgPingExtHumanNodename human_nodename; + clusterMsgPingExtForgottenNode forgotten_node; + clusterMsgPingExtShardId shard_id; + } ext[]; /* Actual extension information, formatted so that the data is 8 + * byte aligned, regardless of its content. */ +} clusterMsgPingExt; + +union clusterMsgData { + /* PING, MEET and PONG */ + struct { + /* Array of N clusterMsgDataGossip structures */ + clusterMsgDataGossip gossip[1]; + /* Extension data that can optionally be sent for ping/meet/pong + * messages. We can't explicitly define them here though, since + * the gossip array isn't the real length of the gossip data. */ + } ping; + + /* FAIL */ + struct { + clusterMsgDataFail about; + } fail; + + /* PUBLISH */ + struct { + clusterMsgDataPublish msg; + } publish; + + /* UPDATE */ + struct { + clusterMsgDataUpdate nodecfg; + } update; + + /* MODULE */ + struct { + clusterMsgModule msg; + } module; +}; + +#define CLUSTER_PROTO_VER 1 /* Cluster bus protocol version. */ + +typedef struct { + char sig[4]; /* Signature "RCmb" (Redis Cluster message bus). 
*/ + uint32_t totlen; /* Total length of this message */ + uint16_t ver; /* Protocol version, currently set to 1. */ + uint16_t port; /* Primary port number (TCP or TLS). */ + uint16_t type; /* Message type */ + uint16_t count; /* Only used for some kind of messages. */ + uint64_t currentEpoch; /* The epoch accordingly to the sending node. */ + uint64_t configEpoch; /* The config epoch if it's a master, or the last + epoch advertised by its master if it is a + slave. */ + uint64_t offset; /* Master replication offset if node is a master or + processed replication offset if node is a slave. */ + char sender[CLUSTER_NAMELEN]; /* Name of the sender node */ + unsigned char myslots[CLUSTER_SLOTS/8]; + char slaveof[CLUSTER_NAMELEN]; + char myip[NET_IP_STR_LEN]; /* Sender IP, if not all zeroed. */ + uint16_t extensions; /* Number of extensions sent along with this packet. */ + char notused1[30]; /* 30 bytes reserved for future usage. */ + uint16_t pport; /* Secondary port number: if primary port is TCP port, this is + TLS port, and if primary port is TLS port, this is TCP port.*/ + uint16_t cport; /* Sender TCP cluster bus port */ + uint16_t flags; /* Sender node flags */ + unsigned char state; /* Cluster state from the POV of the sender */ + unsigned char mflags[3]; /* Message flags: CLUSTERMSG_FLAG[012]_... */ + union clusterMsgData data; +} clusterMsg; + +/* clusterMsg defines the gossip wire protocol exchanged among Redis cluster + * members, which can be running different versions of redis-server bits, + * especially during cluster rolling upgrades. + * + * Therefore, fields in this struct should remain at the same offset from + * release to release. The static asserts below ensures that incompatible + * changes in clusterMsg be caught at compile time. 
+ */ + +static_assert(offsetof(clusterMsg, sig) == 0, "unexpected field offset"); +static_assert(offsetof(clusterMsg, totlen) == 4, "unexpected field offset"); +static_assert(offsetof(clusterMsg, ver) == 8, "unexpected field offset"); +static_assert(offsetof(clusterMsg, port) == 10, "unexpected field offset"); +static_assert(offsetof(clusterMsg, type) == 12, "unexpected field offset"); +static_assert(offsetof(clusterMsg, count) == 14, "unexpected field offset"); +static_assert(offsetof(clusterMsg, currentEpoch) == 16, "unexpected field offset"); +static_assert(offsetof(clusterMsg, configEpoch) == 24, "unexpected field offset"); +static_assert(offsetof(clusterMsg, offset) == 32, "unexpected field offset"); +static_assert(offsetof(clusterMsg, sender) == 40, "unexpected field offset"); +static_assert(offsetof(clusterMsg, myslots) == 80, "unexpected field offset"); +static_assert(offsetof(clusterMsg, slaveof) == 2128, "unexpected field offset"); +static_assert(offsetof(clusterMsg, myip) == 2168, "unexpected field offset"); +static_assert(offsetof(clusterMsg, extensions) == 2214, "unexpected field offset"); +static_assert(offsetof(clusterMsg, notused1) == 2216, "unexpected field offset"); +static_assert(offsetof(clusterMsg, pport) == 2246, "unexpected field offset"); +static_assert(offsetof(clusterMsg, cport) == 2248, "unexpected field offset"); +static_assert(offsetof(clusterMsg, flags) == 2250, "unexpected field offset"); +static_assert(offsetof(clusterMsg, state) == 2252, "unexpected field offset"); +static_assert(offsetof(clusterMsg, mflags) == 2253, "unexpected field offset"); +static_assert(offsetof(clusterMsg, data) == 2256, "unexpected field offset"); + +#define CLUSTERMSG_MIN_LEN (sizeof(clusterMsg)-sizeof(union clusterMsgData)) + +/* Message flags better specify the packet content or are used to + * provide some information about the node state. */ +#define CLUSTERMSG_FLAG0_PAUSED (1<<0) /* Master paused for manual failover. 
*/ +#define CLUSTERMSG_FLAG0_FORCEACK (1<<1) /* Give ACK to AUTH_REQUEST even if + master is up. */ +#define CLUSTERMSG_FLAG0_EXT_DATA (1<<2) /* Message contains extension data */ + +struct _clusterNode { + mstime_t ctime; /* Node object creation time. */ + char name[CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size */ + char shard_id[CLUSTER_NAMELEN]; /* shard id, hex string, sha1-size */ + int flags; /* CLUSTER_NODE_... */ + uint64_t configEpoch; /* Last configEpoch observed for this node */ + unsigned char slots[CLUSTER_SLOTS/8]; /* slots handled by this node */ + uint16_t *slot_info_pairs; /* Slots info represented as (start/end) pair (consecutive index). */ + int slot_info_pairs_count; /* Used number of slots in slot_info_pairs */ + int numslots; /* Number of slots handled by this node */ + int numslaves; /* Number of slave nodes, if this is a master */ + clusterNode **slaves; /* pointers to slave nodes */ + clusterNode *slaveof; /* pointer to the master node. Note that it + may be NULL even if the node is a slave + if we don't have the master node in our + tables. */ + unsigned long long last_in_ping_gossip; /* The number of the last carried in the ping gossip section */ + mstime_t ping_sent; /* Unix time we sent latest ping */ + mstime_t pong_received; /* Unix time we received the pong */ + mstime_t data_received; /* Unix time we received any data */ + mstime_t fail_time; /* Unix time when FAIL flag was set */ + mstime_t voted_time; /* Last time we voted for a slave of this master */ + mstime_t repl_offset_time; /* Unix time we received offset for this node */ + mstime_t orphaned_time; /* Starting time of orphaned master condition */ + long long repl_offset; /* Last known repl offset for this node. */ + char ip[NET_IP_STR_LEN]; /* Latest known IP address of this node */ + sds hostname; /* The known hostname for this node */ + sds human_nodename; /* The known human readable nodename for this node */ + int tcp_port; /* Latest known clients TCP port. 
*/ + int tls_port; /* Latest known clients TLS port */ + int cport; /* Latest known cluster port of this node. */ + clusterLink *link; /* TCP/IP link established toward this node */ + clusterLink *inbound_link; /* TCP/IP link accepted from this node */ + list *fail_reports; /* List of nodes signaling this as failing */ +}; + +struct clusterState { + clusterNode *myself; /* This node */ + uint64_t currentEpoch; + int state; /* CLUSTER_OK, CLUSTER_FAIL, ... */ + int size; /* Num of master nodes with at least one slot */ + dict *nodes; /* Hash table of name -> clusterNode structures */ + dict *shards; /* Hash table of shard_id -> list (of nodes) structures */ + dict *nodes_black_list; /* Nodes we don't re-add for a few seconds. */ + clusterNode *migrating_slots_to[CLUSTER_SLOTS]; + clusterNode *importing_slots_from[CLUSTER_SLOTS]; + clusterNode *slots[CLUSTER_SLOTS]; + rax *slots_to_channels; + /* The following fields are used to take the slave state on elections. */ + mstime_t failover_auth_time; /* Time of previous or next election. */ + int failover_auth_count; /* Number of votes received so far. */ + int failover_auth_sent; /* True if we already asked for votes. */ + int failover_auth_rank; /* This slave rank for current auth request. */ + uint64_t failover_auth_epoch; /* Epoch of the current election. */ + int cant_failover_reason; /* Why a slave is currently not able to + failover. See the CANT_FAILOVER_* macros. */ + /* Manual failover state in common. */ + mstime_t mf_end; /* Manual failover time limit (ms unixtime). + It is zero if there is no MF in progress. */ + /* Manual failover state of master. */ + clusterNode *mf_slave; /* Slave performing the manual failover. */ + /* Manual failover state of slave. */ + long long mf_master_offset; /* Master offset the slave needs to start MF + or -1 if still not received. */ + int mf_can_start; /* If non-zero signal that the manual failover + can start requesting masters vote. 
*/ + /* The following fields are used by masters to take state on elections. */ + uint64_t lastVoteEpoch; /* Epoch of the last vote granted. */ + int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */ + /* Stats */ + /* Messages received and sent by type. */ + long long stats_bus_messages_sent[CLUSTERMSG_TYPE_COUNT]; + long long stats_bus_messages_received[CLUSTERMSG_TYPE_COUNT]; + long long stats_pfail_nodes; /* Number of nodes in PFAIL status, + excluding nodes without address. */ + unsigned long long stat_cluster_links_buffer_limit_exceeded; /* Total number of cluster links freed due to exceeding buffer limit */ + + /* Bit map for slots that are no longer claimed by the owner in cluster PING + * messages. During slot migration, the owner will stop claiming the slot after + * the ownership transfer. Set the bit corresponding to the slot when a node + * stops claiming the slot. This prevents spreading incorrect information (that + * source still owns the slot) using UPDATE messages. */ + unsigned char owner_not_claiming_slot[CLUSTER_SLOTS / 8]; +}; + + +#endif //CLUSTER_LEGACY_H diff --git a/src/db.c b/src/db.c index c4c22907e..a369c9a9a 100644 --- a/src/db.c +++ b/src/db.c @@ -2197,7 +2197,7 @@ int dbExpand(const redisDb *db, uint64_t db_size, dbKeyType keyType, int try_exp dict *d; if (server.cluster_enabled) { for (int i = 0; i < CLUSTER_SLOTS; i++) { - if (clusterNodeGetSlotBit(server.cluster->myself, i)) { + if (clusterNodeCoversSlot(getMyClusterNode(), i)) { /* We don't know exact number of keys that would fall into each slot, but we can approximate it, assuming even distribution. 
*/ if (keyType == DB_MAIN) { d = db->dict[i]; diff --git a/src/debug.c b/src/debug.c index 1d1176359..26cb98f1e 100644 --- a/src/debug.c +++ b/src/debug.c @@ -496,11 +496,9 @@ void debugCommand(client *c) { " In case RESET is provided the peak reset time will be restored to the default value", "REPLYBUFFER RESIZING <0|1>", " Enable or disable the reply buffer resize cron job", -"CLUSTERLINK KILL ", -" Kills the link based on the direction to/from (both) with the provided node." , NULL }; - addReplyHelp(c, help); + addExtendedReplyHelp(c, help, clusterDebugCommandExtendedHelp()); } else if (!strcasecmp(c->argv[1]->ptr,"segfault")) { /* Compiler gives warnings about writing to a random address * e.g "*((char*)-1) = 'x';". As a workaround, we map a read-only area @@ -1023,34 +1021,7 @@ NULL return; } addReply(c, shared.ok); - } else if(!strcasecmp(c->argv[1]->ptr,"CLUSTERLINK") && - !strcasecmp(c->argv[2]->ptr,"KILL") && - c->argc == 5) { - if (!server.cluster_enabled) { - addReplyError(c, "Debug option only available for cluster mode enabled setup!"); - return; - } - - /* Find the node. */ - clusterNode *n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); - if (!n) { - addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[4]->ptr); - return; - } - - /* Terminate the link based on the direction or all. 
*/ - if (!strcasecmp(c->argv[3]->ptr,"from")) { - freeClusterLink(n->inbound_link); - } else if (!strcasecmp(c->argv[3]->ptr,"to")) { - freeClusterLink(n->link); - } else if (!strcasecmp(c->argv[3]->ptr,"all")) { - freeClusterLink(n->link); - freeClusterLink(n->inbound_link); - } else { - addReplyErrorFormat(c, "Unknown direction %s", (char*) c->argv[3]->ptr); - } - addReply(c,shared.ok); - } else { + } else if(!handleDebugClusterCommand(c)) { addReplySubcommandSyntaxError(c); return; } diff --git a/src/module.c b/src/module.c index 0428ac59c..b33192e08 100644 --- a/src/module.c +++ b/src/module.c @@ -6466,7 +6466,7 @@ RedisModuleCallReply *RM_Call(RedisModuleCtx *ctx, const char *cmdname, const ch c->flags &= ~(CLIENT_READONLY|CLIENT_ASKING); c->flags |= ctx->client->flags & (CLIENT_READONLY|CLIENT_ASKING); if (getNodeByQuery(c,c->cmd,c->argv,c->argc,NULL,&error_code) != - server.cluster->myself) + getMyClusterNode()) { sds msg = NULL; if (error_code == CLUSTER_REDIR_DOWN_RO_STATE) { @@ -8917,23 +8917,7 @@ char **RM_GetClusterNodesList(RedisModuleCtx *ctx, size_t *numnodes) { UNUSED(ctx); if (!server.cluster_enabled) return NULL; - size_t count = dictSize(server.cluster->nodes); - char **ids = zmalloc((count+1)*REDISMODULE_NODE_ID_LEN); - dictIterator *di = dictGetIterator(server.cluster->nodes); - dictEntry *de; - int j = 0; - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - if (node->flags & (CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE)) continue; - ids[j] = zmalloc(REDISMODULE_NODE_ID_LEN); - memcpy(ids[j],node->name,REDISMODULE_NODE_ID_LEN); - j++; - } - *numnodes = j; - ids[j] = NULL; /* Null term so that FreeClusterNodesList does not need - * to also get the count argument. */ - dictReleaseIterator(di); - return ids; + return getClusterNodesList(numnodes); } /* Free the node list obtained with RedisModule_GetClusterNodesList. */ @@ -8947,7 +8931,7 @@ void RM_FreeClusterNodesList(char **ids) { * is disabled. 
*/ const char *RM_GetMyClusterID(void) { if (!server.cluster_enabled) return NULL; - return server.cluster->myself->name; + return getMyClusterId(); } /* Return the number of nodes in the cluster, regardless of their state @@ -8956,7 +8940,7 @@ const char *RM_GetMyClusterID(void) { * cluster mode, zero is returned. */ size_t RM_GetClusterSize(void) { if (!server.cluster_enabled) return 0; - return dictSize(server.cluster->nodes); + return getClusterSize(); } /* Populate the specified info for the node having as ID the specified 'id', @@ -8983,20 +8967,19 @@ int RM_GetClusterNodeInfo(RedisModuleCtx *ctx, const char *id, char *ip, char *m UNUSED(ctx); clusterNode *node = clusterLookupNode(id, strlen(id)); - if (node == NULL || - node->flags & (CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE)) + if (node == NULL || clusterNodePending(node)) { return REDISMODULE_ERR; } - if (ip) redis_strlcpy(ip,node->ip,NET_IP_STR_LEN); + if (ip) redis_strlcpy(ip, clusterNodeIp(node),NET_IP_STR_LEN); if (master_id) { /* If the information is not available, the function will set the * field to zero bytes, so that when the field can't be populated the * function kinda remains predictable. */ - if (node->flags & CLUSTER_NODE_SLAVE && node->slaveof) - memcpy(master_id,node->slaveof->name,REDISMODULE_NODE_ID_LEN); + if (clusterNodeIsSlave(node) && clusterNodeGetSlaveof(node)) + memcpy(master_id, clusterNodeGetName(clusterNodeGetSlaveof(node)) ,REDISMODULE_NODE_ID_LEN); else memset(master_id,0,REDISMODULE_NODE_ID_LEN); } @@ -9006,12 +8989,12 @@ int RM_GetClusterNodeInfo(RedisModuleCtx *ctx, const char *id, char *ip, char *m * we can provide binary compatibility. 
*/ if (flags) { *flags = 0; - if (node->flags & CLUSTER_NODE_MYSELF) *flags |= REDISMODULE_NODE_MYSELF; - if (node->flags & CLUSTER_NODE_MASTER) *flags |= REDISMODULE_NODE_MASTER; - if (node->flags & CLUSTER_NODE_SLAVE) *flags |= REDISMODULE_NODE_SLAVE; - if (node->flags & CLUSTER_NODE_PFAIL) *flags |= REDISMODULE_NODE_PFAIL; - if (node->flags & CLUSTER_NODE_FAIL) *flags |= REDISMODULE_NODE_FAIL; - if (node->flags & CLUSTER_NODE_NOFAILOVER) *flags |= REDISMODULE_NODE_NOFAILOVER; + if (clusterNodeIsMyself(node)) *flags |= REDISMODULE_NODE_MYSELF; + if (clusterNodeIsMaster(node)) *flags |= REDISMODULE_NODE_MASTER; + if (clusterNodeIsSlave(node)) *flags |= REDISMODULE_NODE_SLAVE; + if (clusterNodeTimedOut(node)) *flags |= REDISMODULE_NODE_PFAIL; + if (clusterNodeIsFailing(node)) *flags |= REDISMODULE_NODE_FAIL; + if (clusterNodeIsNoFailover(node)) *flags |= REDISMODULE_NODE_NOFAILOVER; } return REDISMODULE_OK; } diff --git a/src/networking.c b/src/networking.c index 718271aec..847eee3d5 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1117,14 +1117,18 @@ void addReplyVerbatim(client *c, const char *s, size_t len, const char *ext) { } } -/* Add an array of C strings as status replies with a heading. - * This function is typically invoked by from commands that support - * subcommands in response to the 'help' subcommand. The help array - * is terminated by NULL sentinel. */ -void addReplyHelp(client *c, const char **help) { +/* This function is similar to the addReplyHelp function but adds the + * ability to pass in two arrays of strings. Some commands have + * some additional subcommands based on the specific feature implementation + * Redis is compiled with (currently just clustering). This function allows + * to pass is the common subcommands in `help` and any implementation + * specific subcommands in `extended_help`. 
+ */ +void addExtendedReplyHelp(client *c, const char **help, const char **extended_help) { sds cmd = sdsnew((char*) c->argv[0]->ptr); void *blenp = addReplyDeferredLen(c); int blen = 0; + int idx = 0; sdstoupper(cmd); addReplyStatusFormat(c, @@ -1132,6 +1136,10 @@ void addReplyHelp(client *c, const char **help) { sdsfree(cmd); while (help[blen]) addReplyStatus(c,help[blen++]); + if (extended_help) { + while (extended_help[idx]) addReplyStatus(c,extended_help[idx++]); + } + blen += idx; addReplyStatus(c,"HELP"); addReplyStatus(c," Print this help."); @@ -1141,6 +1149,14 @@ void addReplyHelp(client *c, const char **help) { setDeferredArrayLen(c,blenp,blen); } +/* Add an array of C strings as status replies with a heading. + * This function is typically invoked by commands that support + * subcommands in response to the 'help' subcommand. The help array + * is terminated by NULL sentinel. */ +void addReplyHelp(client *c, const char **help) { + addExtendedReplyHelp(c, help, NULL); +} + /* Add a suggestive error reply. * This function is typically invoked by from commands that support * subcommands in response to an unknown subcommand or argument error. */ diff --git a/src/replication.c b/src/replication.c index 313f69152..e4b7c42e1 100644 --- a/src/replication.c +++ b/src/replication.c @@ -951,7 +951,11 @@ void syncCommand(client *c) { } if (!strcasecmp(c->argv[1]->ptr,server.replid)) { - replicationUnsetMaster(); + if (server.cluster_enabled) { + clusterPromoteSelfToMaster(); + } else { + replicationUnsetMaster(); + } sds client = catClientInfoString(sdsempty(),c); serverLog(LL_NOTICE, "MASTER MODE enabled (failover request from '%s')",client); @@ -3774,7 +3778,7 @@ void replicationCron(void) { * match the one stored into 'mf_master_offset' state. 
*/ int manual_failover_in_progress = ((server.cluster_enabled && - server.cluster->mf_end) || + clusterManualFailoverTimeLimit()) || server.failover_end_time) && isPausedActionsWithUpdate(PAUSE_ACTION_REPLICA); @@ -4061,12 +4065,10 @@ void abortFailover(const char *err) { * will attempt forever and must be manually aborted. */ void failoverCommand(client *c) { - if (server.cluster_enabled) { - addReplyError(c,"FAILOVER not allowed in cluster mode. " - "Use CLUSTER FAILOVER command instead."); + if (!clusterAllowFailoverCmd(c)) { return; } - + /* Handle special case for abort */ if ((c->argc == 2) && !strcasecmp(c->argv[1]->ptr,"abort")) { if (server.failover_state == NO_FAILOVER) { diff --git a/src/script.c b/src/script.c index d0b9b9635..678773d96 100644 --- a/src/script.c +++ b/src/script.c @@ -429,7 +429,7 @@ static int scriptVerifyClusterState(scriptRunCtx *run_ctx, client *c, client *or c->flags &= ~(CLIENT_READONLY | CLIENT_ASKING); c->flags |= original_c->flags & (CLIENT_READONLY | CLIENT_ASKING); int hashslot = -1; - if (getNodeByQuery(c, c->cmd, c->argv, c->argc, &hashslot, &error_code) != server.cluster->myself) { + if (getNodeByQuery(c, c->cmd, c->argv, c->argc, &hashslot, &error_code) != getMyClusterNode()) { if (error_code == CLUSTER_REDIR_DOWN_RO_STATE) { *err = sdsnew( "Script attempted to execute a write command while the " diff --git a/src/server.c b/src/server.c index e63a2ffff..454bd969d 100644 --- a/src/server.c +++ b/src/server.c @@ -4037,7 +4037,7 @@ int processCommand(client *c) { int error_code; clusterNode *n = getNodeByQuery(c,c->cmd,c->argv,c->argc, &c->slot,&error_code); - if (n == NULL || n != server.cluster->myself) { + if (n == NULL || !clusterNodeIsMyself(n)) { if (c->cmd->proc == execCommand) { discardTransaction(c); } else { @@ -6838,7 +6838,7 @@ int redisIsSupervised(int mode) { int iAmMaster(void) { return ((!server.cluster_enabled && server.masterhost == NULL) || - (server.cluster_enabled && 
nodeIsMaster(server.cluster->myself))); + (server.cluster_enabled && clusterNodeIsMaster(getMyClusterNode()))); } #ifdef REDIS_TEST @@ -7161,7 +7161,7 @@ int main(int argc, char **argv) { ACLLoadUsersAtStartup(); initListeners(); if (server.cluster_enabled) { - clusterInitListeners(); + clusterInitLast(); } InitServerLast(); diff --git a/src/server.h b/src/server.h index 902050889..66fb288aa 100644 --- a/src/server.h +++ b/src/server.h @@ -738,6 +738,7 @@ struct RedisModuleCtx; struct moduleLoadQueueEntry; struct RedisModuleKeyOptCtx; struct RedisModuleCommand; +struct clusterState; /* Each module type implementation should export a set of methods in order * to serialize and deserialize the value in the RDB file, rewrite the AOF @@ -2626,6 +2627,7 @@ void addReplySetLen(client *c, long length); void addReplyAttributeLen(client *c, long length); void addReplyPushLen(client *c, long length); void addReplyHelp(client *c, const char **help); +void addExtendedReplyHelp(client *c, const char **help, const char **extended_help); void addReplySubcommandSyntaxError(client *c); void addReplyLoadedModules(client *c); void copyReplicaOutputBuffer(client *dst, client *src);