diff --git a/src/Makefile b/src/Makefile index c28f6f2b9..75d18f5e6 100644 --- a/src/Makefile +++ b/src/Makefile @@ -345,7 +345,7 @@ endif REDIS_SERVER_NAME=redis-server$(PROG_SUFFIX) REDIS_SENTINEL_NAME=redis-sentinel$(PROG_SUFFIX) -REDIS_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o +REDIS_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o 
monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o REDIS_CLI_NAME=redis-cli$(PROG_SUFFIX) REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o ae.o redisassert.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o REDIS_BENCHMARK_NAME=redis-benchmark$(PROG_SUFFIX) diff --git a/src/cluster.c b/src/cluster.c index a4e9080b3..74d6a4d6d 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1,1341 +1,8 @@ -/* Redis Cluster implementation. - * - * Copyright (c) 2009-2012, Salvatore Sanfilippo - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - #include "server.h" #include "cluster.h" -#include "endianconv.h" -#include "connection.h" -#include -#include -#include -#include -#include -#include -#include -#include #include -/* A global reference to myself is handy to make code more clear. - * Myself always points to server.cluster->myself, that is, the clusterNode - * that represents this node. */ -clusterNode *myself = NULL; - -clusterNode *createClusterNode(char *nodename, int flags); -void clusterAddNode(clusterNode *node); -void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask); -void clusterReadHandler(connection *conn); -void clusterSendPing(clusterLink *link, int type); -void clusterSendFail(char *nodename); -void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request); -void clusterUpdateState(void); -int clusterNodeGetSlotBit(clusterNode *n, int slot); -list *clusterGetNodesInMyShard(clusterNode *node); -int clusterNodeAddSlave(clusterNode *master, clusterNode *slave); -int clusterAddSlot(clusterNode *n, int slot); -int clusterDelSlot(int slot); -int clusterDelNodeSlots(clusterNode *node); -int clusterNodeSetSlotBit(clusterNode *n, int slot); -void clusterSetMaster(clusterNode *n); -void clusterHandleSlaveFailover(void); -void clusterHandleSlaveMigration(int max_slaves); -int bitmapTestBit(unsigned char *bitmap, int pos); -void bitmapSetBit(unsigned char *bitmap, int pos); -void bitmapClearBit(unsigned char *bitmap, int 
pos); -void clusterDoBeforeSleep(int flags); -void clusterSendUpdate(clusterLink *link, clusterNode *node); -void resetManualFailover(void); -void clusterCloseAllSlots(void); -void clusterSetNodeAsMaster(clusterNode *n); -void clusterDelNode(clusterNode *delnode); -sds representClusterNodeFlags(sds ci, uint16_t flags); -sds representSlotInfo(sds ci, uint16_t *slot_info_pairs, int slot_info_pairs_count); -void clusterFreeNodesSlotsInfo(clusterNode *n); -uint64_t clusterGetMaxEpoch(void); -int clusterBumpConfigEpochWithoutConsensus(void); -void moduleCallClusterReceivers(const char *sender_id, uint64_t module_id, uint8_t type, const unsigned char *payload, uint32_t len); -const char *clusterGetMessageTypeString(int type); -void removeChannelsInSlot(unsigned int slot); -unsigned int countKeysInSlot(unsigned int hashslot); -unsigned int countChannelsInSlot(unsigned int hashslot); -unsigned int delKeysInSlot(unsigned int hashslot); -void clusterAddNodeToShard(const char *shard_id, clusterNode *node); -list *clusterLookupNodeListByShardId(const char *shard_id); -void clusterRemoveNodeFromShard(clusterNode *node); -int auxShardIdSetter(clusterNode *n, void *value, int length); -sds auxShardIdGetter(clusterNode *n, sds s); -int auxShardIdPresent(clusterNode *n); -int auxHumanNodenameSetter(clusterNode *n, void *value, int length); -sds auxHumanNodenameGetter(clusterNode *n, sds s); -int auxHumanNodenamePresent(clusterNode *n); -int auxTcpPortSetter(clusterNode *n, void *value, int length); -sds auxTcpPortGetter(clusterNode *n, sds s); -int auxTcpPortPresent(clusterNode *n); -int auxTlsPortSetter(clusterNode *n, void *value, int length); -sds auxTlsPortGetter(clusterNode *n, sds s); -int auxTlsPortPresent(clusterNode *n); -static void clusterBuildMessageHdr(clusterMsg *hdr, int type, size_t msglen); - -int getNodeDefaultClientPort(clusterNode *n) { - return server.tls_cluster ? 
n->tls_port : n->tcp_port; -} - -static inline int getNodeDefaultReplicationPort(clusterNode *n) { - return server.tls_replication ? n->tls_port : n->tcp_port; -} - -static inline int getNodeClientPort(clusterNode *n, int use_tls) { - return use_tls ? n->tls_port : n->tcp_port; -} - -static inline int defaultClientPort(void) { - return server.tls_cluster ? server.tls_port : server.port; -} - -/* When a cluster command is called, we need to decide whether to return TLS info or - * non-TLS info by the client's connection type. However if the command is called by - * a Lua script or RM_call, there is no connection in the fake client, so we use - * server.current_client here to get the real client if available. And if it is not - * available (modules may call commands without a real client), we return the default - * info, which is determined by server.tls_cluster. */ -static int shouldReturnTlsInfo(void) { - if (server.current_client && server.current_client->conn) { - return connIsTLS(server.current_client->conn); - } else { - return server.tls_cluster; - } -} - -/* Links to the next and previous entries for keys in the same slot are stored - * in the dict entry metadata. See Slot to Key API below. */ -#define dictEntryNextInSlot(de) \ - (((clusterDictEntryMetadata *)dictEntryMetadata(de))->next) -#define dictEntryPrevInSlot(de) \ - (((clusterDictEntryMetadata *)dictEntryMetadata(de))->prev) - -#define isSlotUnclaimed(slot) \ - (server.cluster->slots[slot] == NULL || \ - bitmapTestBit(server.cluster->owner_not_claiming_slot, slot)) - -#define RCVBUF_INIT_LEN 1024 -#define RCVBUF_MAX_PREALLOC (1<<20) /* 1MB */ - -/* Cluster nodes hash table, mapping nodes addresses 1.2.3.4:6379 to - * clusterNode structures. 
*/ -dictType clusterNodesDictType = { - dictSdsHash, /* hash function */ - NULL, /* key dup */ - NULL, /* val dup */ - dictSdsKeyCompare, /* key compare */ - dictSdsDestructor, /* key destructor */ - NULL, /* val destructor */ - NULL /* allow to expand */ -}; - -/* Cluster re-addition blacklist. This maps node IDs to the time - * we can re-add this node. The goal is to avoid reading a removed - * node for some time. */ -dictType clusterNodesBlackListDictType = { - dictSdsCaseHash, /* hash function */ - NULL, /* key dup */ - NULL, /* val dup */ - dictSdsKeyCaseCompare, /* key compare */ - dictSdsDestructor, /* key destructor */ - NULL, /* val destructor */ - NULL /* allow to expand */ -}; - -static ConnectionType *connTypeOfCluster(void) { - if (server.tls_cluster) { - return connectionTypeTls(); - } - - return connectionTypeTcp(); -} -/* Cluster shards hash table, mapping shard id to list of nodes */ -dictType clusterSdsToListType = { - dictSdsHash, /* hash function */ - NULL, /* key dup */ - NULL, /* val dup */ - dictSdsKeyCompare, /* key compare */ - dictSdsDestructor, /* key destructor */ - dictListDestructor, /* val destructor */ - NULL /* allow to expand */ -}; - -/* Aux fields are introduced in Redis 7.2 to support the persistence - * of various important node properties, such as shard id, in nodes.conf. - * Aux fields take an explicit format of name=value pairs and have no - * intrinsic order among them. Aux fields are always grouped together - * at the end of the second column of each row after the node's IP - * address/port/cluster_port and the optional hostname. Aux fields - * are separated by ','. 
*/ - -/* Aux field setter function prototype - * return C_OK when the update is successful; C_ERR otherwise */ -typedef int (aux_value_setter) (clusterNode* n, void *value, int length); -/* Aux field getter function prototype - * return an sds that is a concatenation of the input sds string and - * the aux value */ -typedef sds (aux_value_getter) (clusterNode* n, sds s); - -typedef int (aux_value_present) (clusterNode* n); - -typedef struct { - char *field; - aux_value_setter *setter; - aux_value_getter *getter; - aux_value_present *isPresent; -} auxFieldHandler; - -/* Assign index to each aux field */ -typedef enum { - af_shard_id, - af_human_nodename, - af_tcp_port, - af_tls_port, - af_count, -} auxFieldIndex; - -/* Note that - * 1. the order of the elements below must match that of their - * indices as defined in auxFieldIndex - * 2. aux name can contain characters that pass the isValidAuxChar check only */ -auxFieldHandler auxFieldHandlers[] = { - {"shard-id", auxShardIdSetter, auxShardIdGetter, auxShardIdPresent}, - {"nodename", auxHumanNodenameSetter, auxHumanNodenameGetter, auxHumanNodenamePresent}, - {"tcp-port", auxTcpPortSetter, auxTcpPortGetter, auxTcpPortPresent}, - {"tls-port", auxTlsPortSetter, auxTlsPortGetter, auxTlsPortPresent}, -}; - -int isValidAuxChar(int c) { - return isalnum(c) || (strchr("!#$%&()*+:;<>?@[]^{|}~", c) == NULL); -} - -int isValidAuxString(char *s, unsigned int length) { - for (unsigned i = 0; i < length; i++) { - if (!isValidAuxChar(s[i])) return 0; - } - return 1; -} - -int auxShardIdSetter(clusterNode *n, void *value, int length) { - if (verifyClusterNodeId(value, length) == C_ERR) { - return C_ERR; - } - memcpy(n->shard_id, value, CLUSTER_NAMELEN); - /* if n already has replicas, make sure they all agree - * on the shard id */ - for (int i = 0; i < n->numslaves; i++) { - if (memcmp(n->slaves[i]->shard_id, n->shard_id, CLUSTER_NAMELEN) != 0) { - return C_ERR; - } - } - clusterAddNodeToShard(value, n); - return C_OK; -} - -sds 
auxShardIdGetter(clusterNode *n, sds s) { - return sdscatprintf(s, "%.40s", n->shard_id); -} - -int auxShardIdPresent(clusterNode *n) { - return strlen(n->shard_id); -} - -int auxHumanNodenameSetter(clusterNode *n, void *value, int length) { - if (n && !strncmp(value, n->human_nodename, length)) { - return C_OK; - } else if (!n && (length == 0)) { - return C_OK; - } - if (n) { - n->human_nodename = sdscpylen(n->human_nodename, value, length); - } else if (sdslen(n->human_nodename) != 0) { - sdsclear(n->human_nodename); - } else { - return C_ERR; - } - return C_OK; -} - -sds auxHumanNodenameGetter(clusterNode *n, sds s) { - return sdscatprintf(s, "%s", n->human_nodename); -} - -int auxHumanNodenamePresent(clusterNode *n) { - return sdslen(n->human_nodename); -} - -int auxTcpPortSetter(clusterNode *n, void *value, int length) { - if (length > 5 || length < 1) { - return C_ERR; - } - char buf[length + 1]; - memcpy(buf, (char*)value, length); - buf[length] = '\0'; - n->tcp_port = atoi(buf); - return (n->tcp_port < 0 || n->tcp_port >= 65536) ? C_ERR : C_OK; -} - -sds auxTcpPortGetter(clusterNode *n, sds s) { - return sdscatprintf(s, "%d", n->tcp_port); -} - -int auxTcpPortPresent(clusterNode *n) { - return n->tcp_port >= 0 && n->tcp_port < 65536; -} - -int auxTlsPortSetter(clusterNode *n, void *value, int length) { - if (length > 5 || length < 1) { - return C_ERR; - } - char buf[length + 1]; - memcpy(buf, (char*)value, length); - buf[length] = '\0'; - n->tls_port = atoi(buf); - return (n->tls_port < 0 || n->tls_port >= 65536) ? 
C_ERR : C_OK; -} - -sds auxTlsPortGetter(clusterNode *n, sds s) { - return sdscatprintf(s, "%d", n->tls_port); -} - -int auxTlsPortPresent(clusterNode *n) { - return n->tls_port >= 0 && n->tls_port < 65536; -} - -/* clusterLink send queue blocks */ -typedef struct { - size_t totlen; /* Total length of this block including the message */ - int refcount; /* Number of cluster link send msg queues containing the message */ - clusterMsg msg; -} clusterMsgSendBlock; - -/* ----------------------------------------------------------------------------- - * Initialization - * -------------------------------------------------------------------------- */ - -/* Load the cluster config from 'filename'. - * - * If the file does not exist or is zero-length (this may happen because - * when we lock the nodes.conf file, we create a zero-length one for the - * sake of locking if it does not already exist), C_ERR is returned. - * If the configuration was loaded from the file, C_OK is returned. */ -int clusterLoadConfig(char *filename) { - FILE *fp = fopen(filename,"r"); - struct stat sb; - char *line; - int maxline, j; - - if (fp == NULL) { - if (errno == ENOENT) { - return C_ERR; - } else { - serverLog(LL_WARNING, - "Loading the cluster node config from %s: %s", - filename, strerror(errno)); - exit(1); - } - } - - if (redis_fstat(fileno(fp),&sb) == -1) { - serverLog(LL_WARNING, - "Unable to obtain the cluster node config file stat %s: %s", - filename, strerror(errno)); - exit(1); - } - /* Check if the file is zero-length: if so return C_ERR to signal - * we have to write the config. */ - if (sb.st_size == 0) { - fclose(fp); - return C_ERR; - } - - /* Parse the file. Note that single lines of the cluster config file can - * be really long as they include all the hash slots of the node. - * This means in the worst possible case, half of the Redis slots will be - * present in a single line, possibly in importing or migrating state, so - * together with the node ID of the sender/receiver. 
- * - * To simplify we allocate 1024+CLUSTER_SLOTS*128 bytes per line. */ - maxline = 1024+CLUSTER_SLOTS*128; - line = zmalloc(maxline); - while(fgets(line,maxline,fp) != NULL) { - int argc, aux_argc; - sds *argv, *aux_argv; - clusterNode *n, *master; - char *p, *s; - - /* Skip blank lines, they can be created either by users manually - * editing nodes.conf or by the config writing process if stopped - * before the truncate() call. */ - if (line[0] == '\n' || line[0] == '\0') continue; - - /* Split the line into arguments for processing. */ - argv = sdssplitargs(line,&argc); - if (argv == NULL) goto fmterr; - - /* Handle the special "vars" line. Don't pretend it is the last - * line even if it actually is when generated by Redis. */ - if (strcasecmp(argv[0],"vars") == 0) { - if (!(argc % 2)) goto fmterr; - for (j = 1; j < argc; j += 2) { - if (strcasecmp(argv[j],"currentEpoch") == 0) { - server.cluster->currentEpoch = - strtoull(argv[j+1],NULL,10); - } else if (strcasecmp(argv[j],"lastVoteEpoch") == 0) { - server.cluster->lastVoteEpoch = - strtoull(argv[j+1],NULL,10); - } else { - serverLog(LL_NOTICE, - "Skipping unknown cluster config variable '%s'", - argv[j]); - } - } - sdsfreesplitres(argv,argc); - continue; - } - - /* Regular config lines have at least eight fields */ - if (argc < 8) { - sdsfreesplitres(argv,argc); - goto fmterr; - } - - /* Create this node if it does not exist */ - if (verifyClusterNodeId(argv[0], sdslen(argv[0])) == C_ERR) { - sdsfreesplitres(argv, argc); - goto fmterr; - } - n = clusterLookupNode(argv[0], sdslen(argv[0])); - if (!n) { - n = createClusterNode(argv[0],0); - clusterAddNode(n); - } - /* Format for the node address and auxiliary argument information: - * ip:port[@cport][,hostname][,aux=val]*] */ - - aux_argv = sdssplitlen(argv[1], sdslen(argv[1]), ",", 1, &aux_argc); - if (aux_argv == NULL) { - sdsfreesplitres(argv,argc); - goto fmterr; - } - - /* Hostname is an optional argument that defines the endpoint - * that can be 
reported to clients instead of IP. */ - if (aux_argc > 1 && sdslen(aux_argv[1]) > 0) { - n->hostname = sdscpy(n->hostname, aux_argv[1]); - } else if (sdslen(n->hostname) != 0) { - sdsclear(n->hostname); - } - - /* All fields after hostname are auxiliary and they take on - * the format of "aux=val" where both aux and val can contain - * characters that pass the isValidAuxChar check only. The order - * of the aux fields is insignificant. */ - int aux_tcp_port = 0; - int aux_tls_port = 0; - for (int i = 2; i < aux_argc; i++) { - int field_argc; - sds *field_argv; - field_argv = sdssplitlen(aux_argv[i], sdslen(aux_argv[i]), "=", 1, &field_argc); - if (field_argv == NULL || field_argc != 2) { - /* Invalid aux field format */ - if (field_argv != NULL) sdsfreesplitres(field_argv, field_argc); - sdsfreesplitres(argv,argc); - goto fmterr; - } - - /* Validate that both aux and value contain valid characters only */ - for (unsigned j = 0; j < 2; j++) { - if (!isValidAuxString(field_argv[j],sdslen(field_argv[j]))){ - /* Invalid aux field format */ - sdsfreesplitres(field_argv, field_argc); - sdsfreesplitres(argv,argc); - goto fmterr; - } - } - - /* Note that we don't expect lots of aux fields in the foreseeable - * future so a linear search is completely fine. 
*/ - int field_found = 0; - for (unsigned j = 0; j < numElements(auxFieldHandlers); j++) { - if (sdslen(field_argv[0]) != strlen(auxFieldHandlers[j].field) || - memcmp(field_argv[0], auxFieldHandlers[j].field, sdslen(field_argv[0])) != 0) { - continue; - } - field_found = 1; - aux_tcp_port |= j == af_tcp_port; - aux_tls_port |= j == af_tls_port; - if (auxFieldHandlers[j].setter(n, field_argv[1], sdslen(field_argv[1])) != C_OK) { - /* Invalid aux field format */ - sdsfreesplitres(field_argv, field_argc); - sdsfreesplitres(argv,argc); - goto fmterr; - } - } - - if (field_found == 0) { - /* Invalid aux field format */ - sdsfreesplitres(field_argv, field_argc); - sdsfreesplitres(argv,argc); - goto fmterr; - } - - sdsfreesplitres(field_argv, field_argc); - } - /* Address and port */ - if ((p = strrchr(aux_argv[0],':')) == NULL) { - sdsfreesplitres(aux_argv, aux_argc); - sdsfreesplitres(argv,argc); - goto fmterr; - } - *p = '\0'; - memcpy(n->ip,aux_argv[0],strlen(aux_argv[0])+1); - char *port = p+1; - char *busp = strchr(port,'@'); - if (busp) { - *busp = '\0'; - busp++; - } - /* If neither TCP or TLS port is found in aux field, it is considered - * an old version of nodes.conf file.*/ - if (!aux_tcp_port && !aux_tls_port) { - if (server.tls_cluster) { - n->tls_port = atoi(port); - } else { - n->tcp_port = atoi(port); - } - } else if (!aux_tcp_port) { - n->tcp_port = atoi(port); - } else if (!aux_tls_port) { - n->tls_port = atoi(port); - } - /* In older versions of nodes.conf the "@busport" part is missing. - * In this case we set it to the default offset of 10000 from the - * base port. */ - n->cport = busp ? atoi(busp) : (getNodeDefaultClientPort(n) + CLUSTER_PORT_INCR); - - /* The plaintext port for client in a TLS cluster (n->pport) is not - * stored in nodes.conf. It is received later over the bus protocol. 
*/ - - sdsfreesplitres(aux_argv, aux_argc); - - /* Parse flags */ - p = s = argv[2]; - while(p) { - p = strchr(s,','); - if (p) *p = '\0'; - if (!strcasecmp(s,"myself")) { - serverAssert(server.cluster->myself == NULL); - myself = server.cluster->myself = n; - n->flags |= CLUSTER_NODE_MYSELF; - } else if (!strcasecmp(s,"master")) { - n->flags |= CLUSTER_NODE_MASTER; - } else if (!strcasecmp(s,"slave")) { - n->flags |= CLUSTER_NODE_SLAVE; - } else if (!strcasecmp(s,"fail?")) { - n->flags |= CLUSTER_NODE_PFAIL; - } else if (!strcasecmp(s,"fail")) { - n->flags |= CLUSTER_NODE_FAIL; - n->fail_time = mstime(); - } else if (!strcasecmp(s,"handshake")) { - n->flags |= CLUSTER_NODE_HANDSHAKE; - } else if (!strcasecmp(s,"noaddr")) { - n->flags |= CLUSTER_NODE_NOADDR; - } else if (!strcasecmp(s,"nofailover")) { - n->flags |= CLUSTER_NODE_NOFAILOVER; - } else if (!strcasecmp(s,"noflags")) { - /* nothing to do */ - } else { - serverPanic("Unknown flag in redis cluster config file"); - } - if (p) s = p+1; - } - - /* Get master if any. Set the master and populate master's - * slave list. */ - if (argv[3][0] != '-') { - if (verifyClusterNodeId(argv[3], sdslen(argv[3])) == C_ERR) { - sdsfreesplitres(argv, argc); - goto fmterr; - } - master = clusterLookupNode(argv[3], sdslen(argv[3])); - if (!master) { - master = createClusterNode(argv[3],0); - clusterAddNode(master); - } - /* shard_id can be absent if we are loading a nodes.conf generated - * by an older version of Redis; we should follow the primary's - * shard_id in this case */ - if (auxFieldHandlers[af_shard_id].isPresent(n) == 0) { - memcpy(n->shard_id, master->shard_id, CLUSTER_NAMELEN); - clusterAddNodeToShard(master->shard_id, n); - } else if (clusterGetNodesInMyShard(master) != NULL && - memcmp(master->shard_id, n->shard_id, CLUSTER_NAMELEN) != 0) - { - /* If the primary has been added to a shard, make sure this - * node has the same persisted shard id as the primary. 
*/ - goto fmterr; - } - n->slaveof = master; - clusterNodeAddSlave(master,n); - } else if (auxFieldHandlers[af_shard_id].isPresent(n) == 0) { - /* n is a primary but it does not have a persisted shard_id. - * This happens if we are loading a nodes.conf generated by - * an older version of Redis. We should manually update the - * shard membership in this case */ - clusterAddNodeToShard(n->shard_id, n); - } - - /* Set ping sent / pong received timestamps */ - if (atoi(argv[4])) n->ping_sent = mstime(); - if (atoi(argv[5])) n->pong_received = mstime(); - - /* Set configEpoch for this node. - * If the node is a replica, set its config epoch to 0. - * If it's a primary, load the config epoch from the configuration file. */ - n->configEpoch = (nodeIsSlave(n) && n->slaveof) ? 0 : strtoull(argv[6],NULL,10); - - /* Populate hash slots served by this instance. */ - for (j = 8; j < argc; j++) { - int start, stop; - - if (argv[j][0] == '[') { - /* Here we handle migrating / importing slots */ - int slot; - char direction; - clusterNode *cn; - - p = strchr(argv[j],'-'); - serverAssert(p != NULL); - *p = '\0'; - direction = p[1]; /* Either '>' or '<' */ - slot = atoi(argv[j]+1); - if (slot < 0 || slot >= CLUSTER_SLOTS) { - sdsfreesplitres(argv,argc); - goto fmterr; - } - p += 3; - - char *pr = strchr(p, ']'); - size_t node_len = pr - p; - if (pr == NULL || verifyClusterNodeId(p, node_len) == C_ERR) { - sdsfreesplitres(argv, argc); - goto fmterr; - } - cn = clusterLookupNode(p, CLUSTER_NAMELEN); - if (!cn) { - cn = createClusterNode(p,0); - clusterAddNode(cn); - } - if (direction == '>') { - server.cluster->migrating_slots_to[slot] = cn; - } else { - server.cluster->importing_slots_from[slot] = cn; - } - continue; - } else if ((p = strchr(argv[j],'-')) != NULL) { - *p = '\0'; - start = atoi(argv[j]); - stop = atoi(p+1); - } else { - start = stop = atoi(argv[j]); - } - if (start < 0 || start >= CLUSTER_SLOTS || - stop < 0 || stop >= CLUSTER_SLOTS) - { - sdsfreesplitres(argv,argc); 
- goto fmterr; - } - while(start <= stop) clusterAddSlot(n, start++); - } - - sdsfreesplitres(argv,argc); - } - /* Config sanity check */ - if (server.cluster->myself == NULL) goto fmterr; - - zfree(line); - fclose(fp); - - serverLog(LL_NOTICE,"Node configuration loaded, I'm %.40s", myself->name); - - /* Something that should never happen: currentEpoch smaller than - * the max epoch found in the nodes configuration. However we handle this - * as some form of protection against manual editing of critical files. */ - if (clusterGetMaxEpoch() > server.cluster->currentEpoch) { - server.cluster->currentEpoch = clusterGetMaxEpoch(); - } - return C_OK; - -fmterr: - serverLog(LL_WARNING, - "Unrecoverable error: corrupted cluster config file \"%s\".", line); - zfree(line); - if (fp) fclose(fp); - exit(1); -} - -/* Cluster node configuration is exactly the same as CLUSTER NODES output. - * - * This function writes the node config and returns 0, on error -1 - * is returned. - * - * Note: we need to write the file in an atomic way from the point of view - * of the POSIX filesystem semantics, so that if the server is stopped - * or crashes during the write, we'll end with either the old file or the - * new one. Since we have the full payload to write available we can use - * a single write to write the whole file. If the pre-existing file was - * bigger we pad our payload with newlines that are anyway ignored and truncate - * the file afterward. */ -int clusterSaveConfig(int do_fsync) { - sds ci,tmpfilename; - size_t content_size,offset = 0; - ssize_t written_bytes; - int fd = -1; - int retval = C_ERR; - - server.cluster->todo_before_sleep &= ~CLUSTER_TODO_SAVE_CONFIG; - - /* Get the nodes description and concatenate our "vars" directive to - * save currentEpoch and lastVoteEpoch. 
*/ - ci = clusterGenNodesDescription(NULL, CLUSTER_NODE_HANDSHAKE, 0); - ci = sdscatprintf(ci,"vars currentEpoch %llu lastVoteEpoch %llu\n", - (unsigned long long) server.cluster->currentEpoch, - (unsigned long long) server.cluster->lastVoteEpoch); - content_size = sdslen(ci); - - /* Create a temp file with the new content. */ - tmpfilename = sdscatfmt(sdsempty(),"%s.tmp-%i-%I", - server.cluster_configfile,(int) getpid(),mstime()); - if ((fd = open(tmpfilename,O_WRONLY|O_CREAT,0644)) == -1) { - serverLog(LL_WARNING,"Could not open temp cluster config file: %s",strerror(errno)); - goto cleanup; - } - - while (offset < content_size) { - written_bytes = write(fd,ci + offset,content_size - offset); - if (written_bytes <= 0) { - if (errno == EINTR) continue; - serverLog(LL_WARNING,"Failed after writing (%zd) bytes to tmp cluster config file: %s", - offset,strerror(errno)); - goto cleanup; - } - offset += written_bytes; - } - - if (do_fsync) { - server.cluster->todo_before_sleep &= ~CLUSTER_TODO_FSYNC_CONFIG; - if (redis_fsync(fd) == -1) { - serverLog(LL_WARNING,"Could not sync tmp cluster config file: %s",strerror(errno)); - goto cleanup; - } - } - - if (rename(tmpfilename, server.cluster_configfile) == -1) { - serverLog(LL_WARNING,"Could not rename tmp cluster config file: %s",strerror(errno)); - goto cleanup; - } - - if (do_fsync) { - if (fsyncFileDir(server.cluster_configfile) == -1) { - serverLog(LL_WARNING,"Could not sync cluster config file dir: %s",strerror(errno)); - goto cleanup; - } - } - retval = C_OK; /* If we reached this point, everything is fine. 
*/ - -cleanup: - if (fd != -1) close(fd); - if (retval) unlink(tmpfilename); - sdsfree(tmpfilename); - sdsfree(ci); - return retval; -} - -void clusterSaveConfigOrDie(int do_fsync) { - if (clusterSaveConfig(do_fsync) == -1) { - serverLog(LL_WARNING,"Fatal: can't update cluster config file."); - exit(1); - } -} - -/* Lock the cluster config using flock(), and retain the file descriptor used to - * acquire the lock so that the file will be locked as long as the process is up. - * - * This works because we always update nodes.conf with a new version - * in-place, reopening the file, and writing to it in place (later adjusting - * the length with ftruncate()). - * - * On success C_OK is returned, otherwise an error is logged and - * the function returns C_ERR to signal a lock was not acquired. */ -int clusterLockConfig(char *filename) { -/* flock() does not exist on Solaris - * and a fcntl-based solution won't help, as we constantly re-open that file, - * which will release _all_ locks anyway - */ -#if !defined(__sun) - /* To lock it, we need to open the file in a way it is created if - * it does not exist, otherwise there is a race condition with other - * processes. */ - int fd = open(filename,O_WRONLY|O_CREAT|O_CLOEXEC,0644); - if (fd == -1) { - serverLog(LL_WARNING, - "Can't open %s in order to acquire a lock: %s", - filename, strerror(errno)); - return C_ERR; - } - - if (flock(fd,LOCK_EX|LOCK_NB) == -1) { - if (errno == EWOULDBLOCK) { - serverLog(LL_WARNING, - "Sorry, the cluster configuration file %s is already used " - "by a different Redis Cluster node. Please make sure that " - "different nodes use different cluster configuration " - "files.", filename); - } else { - serverLog(LL_WARNING, - "Impossible to lock %s: %s", filename, strerror(errno)); - } - close(fd); - return C_ERR; - } - /* Lock acquired: leak the 'fd' by not closing it until shutdown time, so that - * we'll retain the lock to the file as long as the process exists. 
- * - * After fork, the child process will get the fd opened by the parent process, - * we need save `fd` to `cluster_config_file_lock_fd`, so that in redisFork(), - * it will be closed in the child process. - * If it is not closed, when the main process is killed -9, but the child process - * (redis-aof-rewrite) is still alive, the fd(lock) will still be held by the - * child process, and the main process will fail to get lock, means fail to start. */ - server.cluster_config_file_lock_fd = fd; -#else - UNUSED(filename); -#endif /* __sun */ - - return C_OK; -} - -/* Derives our ports to be announced in the cluster bus. */ -void deriveAnnouncedPorts(int *announced_tcp_port, int *announced_tls_port, - int *announced_cport) { - /* Config overriding announced ports. */ - *announced_tcp_port = server.cluster_announce_port ? - server.cluster_announce_port : server.port; - *announced_tls_port = server.cluster_announce_tls_port ? - server.cluster_announce_tls_port : server.tls_port; - /* Derive cluster bus port. */ - if (server.cluster_announce_bus_port) { - *announced_cport = server.cluster_announce_bus_port; - } else if (server.cluster_port) { - *announced_cport = server.cluster_port; - } else { - *announced_cport = defaultClientPort() + CLUSTER_PORT_INCR; - } -} - -/* Some flags (currently just the NOFAILOVER flag) may need to be updated - * in the "myself" node based on the current configuration of the node, - * that may change at runtime via CONFIG SET. This function changes the - * set of flags in myself->flags accordingly. */ -void clusterUpdateMyselfFlags(void) { - if (!myself) return; - int oldflags = myself->flags; - int nofailover = server.cluster_slave_no_failover ? 
- CLUSTER_NODE_NOFAILOVER : 0; - myself->flags &= ~CLUSTER_NODE_NOFAILOVER; - myself->flags |= nofailover; - if (myself->flags != oldflags) { - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE); - } -} - - -/* We want to take myself->port/cport/pport in sync with the -* cluster-announce-port/cluster-announce-bus-port/cluster-announce-tls-port option. -* The option can be set at runtime via CONFIG SET. */ -void clusterUpdateMyselfAnnouncedPorts(void) { - if (!myself) return; - deriveAnnouncedPorts(&myself->tcp_port,&myself->tls_port,&myself->cport); -} - -/* We want to take myself->ip in sync with the cluster-announce-ip option. -* The option can be set at runtime via CONFIG SET. */ -void clusterUpdateMyselfIp(void) { - if (!myself) return; - static char *prev_ip = NULL; - char *curr_ip = server.cluster_announce_ip; - int changed = 0; - - if (prev_ip == NULL && curr_ip != NULL) changed = 1; - else if (prev_ip != NULL && curr_ip == NULL) changed = 1; - else if (prev_ip && curr_ip && strcmp(prev_ip,curr_ip)) changed = 1; - - if (changed) { - if (prev_ip) zfree(prev_ip); - prev_ip = curr_ip; - - if (curr_ip) { - /* We always take a copy of the previous IP address, by - * duplicating the string. This way later we can check if - * the address really changed. */ - prev_ip = zstrdup(prev_ip); - redis_strlcpy(myself->ip,server.cluster_announce_ip,NET_IP_STR_LEN); - } else { - myself->ip[0] = '\0'; /* Force autodetection. */ - } - } -} - -/* Update the hostname for the specified node with the provided C string. */ -static void updateAnnouncedHostname(clusterNode *node, char *new) { - /* Previous and new hostname are the same, no need to update. 
*/ - if (new && !strcmp(new, node->hostname)) { - return; - } else if (!new && (sdslen(node->hostname) == 0)) { - return; - } - - if (new) { - node->hostname = sdscpy(node->hostname, new); - } else if (sdslen(node->hostname) != 0) { - sdsclear(node->hostname); - } - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); -} - -static void updateAnnouncedHumanNodename(clusterNode *node, char *new) { - if (new && !strcmp(new, node->human_nodename)) { - return; - } else if (!new && (sdslen(node->human_nodename) == 0)) { - return; - } - - if (new) { - node->human_nodename = sdscpy(node->human_nodename, new); - } else if (sdslen(node->human_nodename) != 0) { - sdsclear(node->human_nodename); - } - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); -} - - -static void updateShardId(clusterNode *node, const char *shard_id) { - if (shard_id && memcmp(node->shard_id, shard_id, CLUSTER_NAMELEN) != 0) { - clusterRemoveNodeFromShard(node); - memcpy(node->shard_id, shard_id, CLUSTER_NAMELEN); - clusterAddNodeToShard(shard_id, node); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); - } - if (shard_id && myself != node && myself->slaveof == node) { - if (memcmp(myself->shard_id, shard_id, CLUSTER_NAMELEN) != 0) { - /* shard-id can diverge right after a rolling upgrade - * from pre-7.2 releases */ - clusterRemoveNodeFromShard(myself); - memcpy(myself->shard_id, shard_id, CLUSTER_NAMELEN); - clusterAddNodeToShard(shard_id, myself); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_FSYNC_CONFIG); - } - } -} - -/* Update my hostname based on server configuration values */ -void clusterUpdateMyselfHostname(void) { - if (!myself) return; - updateAnnouncedHostname(myself, server.cluster_announce_hostname); -} - -void clusterUpdateMyselfHumanNodename(void) { - if (!myself) return; - updateAnnouncedHumanNodename(myself, server.cluster_announce_human_nodename); -} - -void clusterInit(void) { - int saveconf = 0; - - server.cluster = zmalloc(sizeof(clusterState)); - server.cluster->myself = 
NULL; - server.cluster->currentEpoch = 0; - server.cluster->state = CLUSTER_FAIL; - server.cluster->size = 1; - server.cluster->todo_before_sleep = 0; - server.cluster->nodes = dictCreate(&clusterNodesDictType); - server.cluster->shards = dictCreate(&clusterSdsToListType); - server.cluster->nodes_black_list = - dictCreate(&clusterNodesBlackListDictType); - server.cluster->failover_auth_time = 0; - server.cluster->failover_auth_count = 0; - server.cluster->failover_auth_rank = 0; - server.cluster->failover_auth_epoch = 0; - server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE; - server.cluster->lastVoteEpoch = 0; - - /* Initialize stats */ - for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) { - server.cluster->stats_bus_messages_sent[i] = 0; - server.cluster->stats_bus_messages_received[i] = 0; - } - server.cluster->stats_pfail_nodes = 0; - server.cluster->stat_cluster_links_buffer_limit_exceeded = 0; - - memset(server.cluster->slots,0, sizeof(server.cluster->slots)); - clusterCloseAllSlots(); - - memset(server.cluster->owner_not_claiming_slot, 0, sizeof(server.cluster->owner_not_claiming_slot)); - - /* Lock the cluster config file to make sure every node uses - * its own nodes.conf. */ - server.cluster_config_file_lock_fd = -1; - if (clusterLockConfig(server.cluster_configfile) == C_ERR) - exit(1); - - /* Load or create a new nodes configuration. */ - if (clusterLoadConfig(server.cluster_configfile) == C_ERR) { - /* No configuration found. We will just use the random name provided - * by the createClusterNode() function. 
*/ - myself = server.cluster->myself = - createClusterNode(NULL,CLUSTER_NODE_MYSELF|CLUSTER_NODE_MASTER); - serverLog(LL_NOTICE,"No cluster configuration found, I'm %.40s", - myself->name); - clusterAddNode(myself); - clusterAddNodeToShard(myself->shard_id, myself); - saveconf = 1; - } - if (saveconf) clusterSaveConfigOrDie(1); - - /* Port sanity check II - * The other handshake port check is triggered too late to stop - * us from trying to use a too-high cluster port number. */ - int port = defaultClientPort(); - if (!server.cluster_port && port > (65535-CLUSTER_PORT_INCR)) { - serverLog(LL_WARNING, "Redis port number too high. " - "Cluster communication port is 10,000 port " - "numbers higher than your Redis port. " - "Your Redis port number must be 55535 or less."); - exit(1); - } - if (!server.bindaddr_count) { - serverLog(LL_WARNING, "No bind address is configured, but it is required for the Cluster bus."); - exit(1); - } - - /* The slots -> channels map is a radix tree. Initialize it here. */ - server.cluster->slots_to_channels = raxNew(); - - /* Set myself->port/cport/pport to my listening ports, we'll just need to - * discover the IP address via MEET messages. */ - deriveAnnouncedPorts(&myself->tcp_port, &myself->tls_port, &myself->cport); - - server.cluster->mf_end = 0; - server.cluster->mf_slave = NULL; - resetManualFailover(); - clusterUpdateMyselfFlags(); - clusterUpdateMyselfIp(); - clusterUpdateMyselfHostname(); - clusterUpdateMyselfHumanNodename(); -} - -void clusterInitListeners(void) { - if (connectionIndexByType(connTypeOfCluster()->get_type(NULL)) < 0) { - serverLog(LL_WARNING, "Missing connection type %s, but it is required for the Cluster bus.", connTypeOfCluster()->get_type(NULL)); - exit(1); - } - - int port = defaultClientPort(); - connListener *listener = &server.clistener; - listener->count = 0; - listener->bindaddr = server.bindaddr; - listener->bindaddr_count = server.bindaddr_count; - listener->port = server.cluster_port ? 
server.cluster_port : port + CLUSTER_PORT_INCR; - listener->ct = connTypeOfCluster(); - if (connListen(listener) == C_ERR ) { - /* Note: the following log text is matched by the test suite. */ - serverLog(LL_WARNING, "Failed listening on port %u (cluster), aborting.", listener->port); - exit(1); - } - - if (createSocketAcceptHandler(&server.clistener, clusterAcceptHandler) != C_OK) { - serverPanic("Unrecoverable error creating Redis Cluster socket accept handler."); - } -} - -/* Reset a node performing a soft or hard reset: - * - * 1) All other nodes are forgotten. - * 2) All the assigned / open slots are released. - * 3) If the node is a slave, it turns into a master. - * 4) Only for hard reset: a new Node ID is generated. - * 5) Only for hard reset: currentEpoch and configEpoch are set to 0. - * 6) The new configuration is saved and the cluster state updated. - * 7) If the node was a slave, the whole data set is flushed away. */ -void clusterReset(int hard) { - dictIterator *di; - dictEntry *de; - int j; - - /* Turn into master. */ - if (nodeIsSlave(myself)) { - clusterSetNodeAsMaster(myself); - replicationUnsetMaster(); - emptyData(-1,EMPTYDB_NO_FLAGS,NULL); - } - - /* Close slots, reset manual failover state. */ - clusterCloseAllSlots(); - resetManualFailover(); - - /* Unassign all the slots. */ - for (j = 0; j < CLUSTER_SLOTS; j++) clusterDelSlot(j); - - /* Recreate shards dict */ - dictEmpty(server.cluster->shards, NULL); - - /* Forget all the nodes, but myself. */ - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - - if (node == myself) continue; - clusterDelNode(node); - } - dictReleaseIterator(di); - - /* Empty the nodes blacklist. */ - dictEmpty(server.cluster->nodes_black_list, NULL); - - /* Hard reset only: set epochs to 0, change node ID. 
*/ - if (hard) { - sds oldname; - - server.cluster->currentEpoch = 0; - server.cluster->lastVoteEpoch = 0; - myself->configEpoch = 0; - serverLog(LL_NOTICE, "configEpoch set to 0 via CLUSTER RESET HARD"); - - /* To change the Node ID we need to remove the old name from the - * nodes table, change the ID, and re-add back with new name. */ - oldname = sdsnewlen(myself->name, CLUSTER_NAMELEN); - dictDelete(server.cluster->nodes,oldname); - sdsfree(oldname); - getRandomHexChars(myself->name, CLUSTER_NAMELEN); - getRandomHexChars(myself->shard_id, CLUSTER_NAMELEN); - clusterAddNode(myself); - serverLog(LL_NOTICE,"Node hard reset, now I'm %.40s", myself->name); - } - - /* Re-populate shards */ - clusterAddNodeToShard(myself->shard_id, myself); - - /* Make sure to persist the new config and update the state. */ - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_FSYNC_CONFIG); -} - -/* ----------------------------------------------------------------------------- - * CLUSTER communication link - * -------------------------------------------------------------------------- */ -static clusterMsgSendBlock *createClusterMsgSendBlock(int type, uint32_t msglen) { - uint32_t blocklen = msglen + sizeof(clusterMsgSendBlock) - sizeof(clusterMsg); - clusterMsgSendBlock *msgblock = zcalloc(blocklen); - msgblock->refcount = 1; - msgblock->totlen = blocklen; - server.stat_cluster_links_memory += blocklen; - clusterBuildMessageHdr(&msgblock->msg,type,msglen); - return msgblock; -} - -static void clusterMsgSendBlockDecrRefCount(void *node) { - clusterMsgSendBlock *msgblock = (clusterMsgSendBlock*)node; - msgblock->refcount--; - serverAssert(msgblock->refcount >= 0); - if (msgblock->refcount == 0) { - server.stat_cluster_links_memory -= msgblock->totlen; - zfree(msgblock); - } -} - -clusterLink *createClusterLink(clusterNode *node) { - clusterLink *link = zmalloc(sizeof(*link)); - link->ctime = mstime(); - link->send_msg_queue = listCreate(); - 
listSetFreeMethod(link->send_msg_queue, clusterMsgSendBlockDecrRefCount); - link->head_msg_send_offset = 0; - link->send_msg_queue_mem = sizeof(list); - link->rcvbuf = zmalloc(link->rcvbuf_alloc = RCVBUF_INIT_LEN); - link->rcvbuf_len = 0; - server.stat_cluster_links_memory += link->rcvbuf_alloc + link->send_msg_queue_mem; - link->conn = NULL; - link->node = node; - /* Related node can only possibly be known at link creation time if this is an outbound link */ - link->inbound = (node == NULL); - if (!link->inbound) { - node->link = link; - } - return link; -} - -/* Free a cluster link, but does not free the associated node of course. - * This function will just make sure that the original node associated - * with this link will have the 'link' field set to NULL. */ -void freeClusterLink(clusterLink *link) { - if (link->conn) { - connClose(link->conn); - link->conn = NULL; - } - server.stat_cluster_links_memory -= sizeof(list) + listLength(link->send_msg_queue)*sizeof(listNode); - listRelease(link->send_msg_queue); - server.stat_cluster_links_memory -= link->rcvbuf_alloc; - zfree(link->rcvbuf); - if (link->node) { - if (link->node->link == link) { - serverAssert(!link->inbound); - link->node->link = NULL; - } else if (link->node->inbound_link == link) { - serverAssert(link->inbound); - link->node->inbound_link = NULL; - } - } - zfree(link); -} - -void setClusterNodeToInboundClusterLink(clusterNode *node, clusterLink *link) { - serverAssert(!link->node); - serverAssert(link->inbound); - if (node->inbound_link) { - /* A peer may disconnect and then reconnect with us, and it's not guaranteed that - * we would always process the disconnection of the existing inbound link before - * accepting a new existing inbound link. Therefore, it's possible to have more than - * one inbound link from the same node at the same time. Our cleanup logic assumes - * a one to one relationship between nodes and inbound links, so we need to kill - * one of the links. 
The existing link is more likely the outdated one, but it's - * possible the other node may need to open another link. */ - serverLog(LL_DEBUG, "Replacing inbound link fd %d from node %.40s with fd %d", - node->inbound_link->conn->fd, node->name, link->conn->fd); - freeClusterLink(node->inbound_link); - } - serverAssert(!node->inbound_link); - node->inbound_link = link; - link->node = node; -} - -static void clusterConnAcceptHandler(connection *conn) { - clusterLink *link; - - if (connGetState(conn) != CONN_STATE_CONNECTED) { - serverLog(LL_VERBOSE, - "Error accepting cluster node connection: %s", connGetLastError(conn)); - connClose(conn); - return; - } - - /* Create a link object we use to handle the connection. - * It gets passed to the readable handler when data is available. - * Initially the link->node pointer is set to NULL as we don't know - * which node is, but the right node is references once we know the - * node identity. */ - link = createClusterLink(NULL); - link->conn = conn; - connSetPrivateData(conn, link); - - /* Register read handler */ - connSetReadHandler(conn, clusterReadHandler); -} - -#define MAX_CLUSTER_ACCEPTS_PER_CALL 1000 -void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { - int cport, cfd; - int max = MAX_CLUSTER_ACCEPTS_PER_CALL; - char cip[NET_IP_STR_LEN]; - int require_auth = TLS_CLIENT_AUTH_YES; - UNUSED(el); - UNUSED(mask); - UNUSED(privdata); - - /* If the server is starting up, don't accept cluster connections: - * UPDATE messages may interact with the database content. 
*/ - if (server.masterhost == NULL && server.loading) return; - - while(max--) { - cfd = anetTcpAccept(server.neterr, fd, cip, sizeof(cip), &cport); - if (cfd == ANET_ERR) { - if (errno != EWOULDBLOCK) - serverLog(LL_VERBOSE, - "Error accepting cluster node: %s", server.neterr); - return; - } - - connection *conn = connCreateAccepted(connTypeOfCluster(), cfd, &require_auth); - - /* Make sure connection is not in an error state */ - if (connGetState(conn) != CONN_STATE_ACCEPTING) { - serverLog(LL_VERBOSE, - "Error creating an accepting connection for cluster node: %s", - connGetLastError(conn)); - connClose(conn); - return; - } - connEnableTcpNoDelay(conn); - connKeepAlive(conn,server.cluster_node_timeout / 1000 * 2); - - /* Use non-blocking I/O for cluster messages. */ - serverLog(LL_VERBOSE,"Accepting cluster node connection from %s:%d", cip, cport); - - /* Accept the connection now. connAccept() may call our handler directly - * or schedule it for later depending on connection implementation. - */ - if (connAccept(conn, clusterConnAcceptHandler) == C_ERR) { - if (connGetState(conn) == CONN_STATE_ERROR) - serverLog(LL_VERBOSE, - "Error accepting cluster node connection: %s", - connGetLastError(conn)); - connClose(conn); - return; - } - } -} - -/* Return the approximated number of sockets we are using in order to - * take the cluster bus connections. */ -unsigned long getClusterConnectionsCount(void) { - /* We decrement the number of nodes by one, since there is the - * "myself" node too in the list. Each node uses two file descriptors, - * one incoming and one outgoing, thus the multiplication by 2. */ - return server.cluster_enabled ? - ((dictSize(server.cluster->nodes)-1)*2) : 0; -} - /* ----------------------------------------------------------------------------- * Key space handling * -------------------------------------------------------------------------- */ @@ -1343,7 +10,7 @@ unsigned long getClusterConnectionsCount(void) { /* We have 16384 hash slots. 
The hash slot of a given key is obtained * as the least significant 14 bits of the crc16 of the key. * - * However if the key contains the {...} pattern, only the part between + * However, if the key contains the {...} pattern, only the part between * { and } is hashed. This may be useful in the future to force certain * keys to be in the same node (assuming no resharding is in progress). */ unsigned int keyHashSlot(char *key, int keylen) { @@ -1397,5141 +64,12 @@ int patternHashSlot(char *pattern, int length) { return crc16(pattern, length) & 0x3FFF; } -/* ----------------------------------------------------------------------------- - * CLUSTER node API - * -------------------------------------------------------------------------- */ - -/* Create a new cluster node, with the specified flags. - * If "nodename" is NULL this is considered a first handshake and a random - * node name is assigned to this node (it will be fixed later when we'll - * receive the first pong). - * - * The node is created and returned to the user, but it is not automatically - * added to the nodes hash table. 
*/ -clusterNode *createClusterNode(char *nodename, int flags) { - clusterNode *node = zmalloc(sizeof(*node)); - - if (nodename) - memcpy(node->name, nodename, CLUSTER_NAMELEN); - else - getRandomHexChars(node->name, CLUSTER_NAMELEN); - getRandomHexChars(node->shard_id, CLUSTER_NAMELEN); - node->ctime = mstime(); - node->configEpoch = 0; - node->flags = flags; - memset(node->slots,0,sizeof(node->slots)); - node->slot_info_pairs = NULL; - node->slot_info_pairs_count = 0; - node->numslots = 0; - node->numslaves = 0; - node->slaves = NULL; - node->slaveof = NULL; - node->last_in_ping_gossip = 0; - node->ping_sent = node->pong_received = 0; - node->data_received = 0; - node->fail_time = 0; - node->link = NULL; - node->inbound_link = NULL; - memset(node->ip,0,sizeof(node->ip)); - node->hostname = sdsempty(); - node->human_nodename = sdsempty(); - node->tcp_port = 0; - node->cport = 0; - node->tls_port = 0; - node->fail_reports = listCreate(); - node->voted_time = 0; - node->orphaned_time = 0; - node->repl_offset_time = 0; - node->repl_offset = 0; - listSetFreeMethod(node->fail_reports,zfree); - return node; -} - -/* This function is called every time we get a failure report from a node. - * The side effect is to populate the fail_reports list (or to update - * the timestamp of an existing report). - * - * 'failing' is the node that is in failure state according to the - * 'sender' node. - * - * The function returns 0 if it just updates a timestamp of an existing - * failure report from the same sender. 1 is returned if a new failure - * report is created. */ -int clusterNodeAddFailureReport(clusterNode *failing, clusterNode *sender) { - list *l = failing->fail_reports; - listNode *ln; - listIter li; - clusterNodeFailReport *fr; - - /* If a failure report from the same sender already exists, just update - * the timestamp. 
*/ - listRewind(l,&li); - while ((ln = listNext(&li)) != NULL) { - fr = ln->value; - if (fr->node == sender) { - fr->time = mstime(); - return 0; - } - } - - /* Otherwise create a new report. */ - fr = zmalloc(sizeof(*fr)); - fr->node = sender; - fr->time = mstime(); - listAddNodeTail(l,fr); - return 1; -} - -/* Remove failure reports that are too old, where too old means reasonably - * older than the global node timeout. Note that anyway for a node to be - * flagged as FAIL we need to have a local PFAIL state that is at least - * older than the global node timeout, so we don't just trust the number - * of failure reports from other nodes. */ -void clusterNodeCleanupFailureReports(clusterNode *node) { - list *l = node->fail_reports; - listNode *ln; - listIter li; - clusterNodeFailReport *fr; - mstime_t maxtime = server.cluster_node_timeout * - CLUSTER_FAIL_REPORT_VALIDITY_MULT; - mstime_t now = mstime(); - - listRewind(l,&li); - while ((ln = listNext(&li)) != NULL) { - fr = ln->value; - if (now - fr->time > maxtime) listDelNode(l,ln); - } -} - -/* Remove the failing report for 'node' if it was previously considered - * failing by 'sender'. This function is called when a node informs us via - * gossip that a node is OK from its point of view (no FAIL or PFAIL flags). - * - * Note that this function is called relatively often as it gets called even - * when there are no nodes failing, and is O(N), however when the cluster is - * fine the failure reports list is empty so the function runs in constant - * time. - * - * The function returns 1 if the failure report was found and removed. - * Otherwise 0 is returned. */ -int clusterNodeDelFailureReport(clusterNode *node, clusterNode *sender) { - list *l = node->fail_reports; - listNode *ln; - listIter li; - clusterNodeFailReport *fr; - - /* Search for a failure report from this sender. 
*/ - listRewind(l,&li); - while ((ln = listNext(&li)) != NULL) { - fr = ln->value; - if (fr->node == sender) break; - } - if (!ln) return 0; /* No failure report from this sender. */ - - /* Remove the failure report. */ - listDelNode(l,ln); - clusterNodeCleanupFailureReports(node); - return 1; -} - -/* Return the number of external nodes that believe 'node' is failing, - * not including this node, that may have a PFAIL or FAIL state for this - * node as well. */ -int clusterNodeFailureReportsCount(clusterNode *node) { - clusterNodeCleanupFailureReports(node); - return listLength(node->fail_reports); -} - -int clusterNodeRemoveSlave(clusterNode *master, clusterNode *slave) { - int j; - - for (j = 0; j < master->numslaves; j++) { - if (master->slaves[j] == slave) { - if ((j+1) < master->numslaves) { - int remaining_slaves = (master->numslaves - j) - 1; - memmove(master->slaves+j,master->slaves+(j+1), - (sizeof(*master->slaves) * remaining_slaves)); - } - master->numslaves--; - if (master->numslaves == 0) - master->flags &= ~CLUSTER_NODE_MIGRATE_TO; - return C_OK; - } - } - return C_ERR; -} - -int clusterNodeAddSlave(clusterNode *master, clusterNode *slave) { - int j; - - /* If it's already a slave, don't add it again. */ - for (j = 0; j < master->numslaves; j++) - if (master->slaves[j] == slave) return C_ERR; - master->slaves = zrealloc(master->slaves, - sizeof(clusterNode*)*(master->numslaves+1)); - master->slaves[master->numslaves] = slave; - master->numslaves++; - master->flags |= CLUSTER_NODE_MIGRATE_TO; - return C_OK; -} - -int clusterCountNonFailingSlaves(clusterNode *n) { - int j, okslaves = 0; - - for (j = 0; j < n->numslaves; j++) - if (!nodeFailed(n->slaves[j])) okslaves++; - return okslaves; -} - -/* Low level cleanup of the node structure. Only called by clusterDelNode(). */ -void freeClusterNode(clusterNode *n) { - sds nodename; - int j; - - /* If the node has associated slaves, we have to set - * all the slaves->slaveof fields to NULL (unknown). 
*/ - for (j = 0; j < n->numslaves; j++) - n->slaves[j]->slaveof = NULL; - - /* Remove this node from the list of slaves of its master. */ - if (nodeIsSlave(n) && n->slaveof) clusterNodeRemoveSlave(n->slaveof,n); - - /* Unlink from the set of nodes. */ - nodename = sdsnewlen(n->name, CLUSTER_NAMELEN); - serverAssert(dictDelete(server.cluster->nodes,nodename) == DICT_OK); - sdsfree(nodename); - sdsfree(n->hostname); - sdsfree(n->human_nodename); - - /* Release links and associated data structures. */ - if (n->link) freeClusterLink(n->link); - if (n->inbound_link) freeClusterLink(n->inbound_link); - listRelease(n->fail_reports); - zfree(n->slaves); - zfree(n); -} - -/* Add a node to the nodes hash table */ -void clusterAddNode(clusterNode *node) { - int retval; - - retval = dictAdd(server.cluster->nodes, - sdsnewlen(node->name,CLUSTER_NAMELEN), node); - serverAssert(retval == DICT_OK); -} - -/* Remove a node from the cluster. The function performs the high level - * cleanup, calling freeClusterNode() for the low level cleanup. - * Here we do the following: - * - * 1) Mark all the slots handled by it as unassigned. - * 2) Remove all the failure reports sent by this node and referenced by - * other nodes. - * 3) Remove the node from the owning shard - * 4) Free the node with freeClusterNode() that will in turn remove it - * from the hash table and from the list of slaves of its master, if - * it is a slave node. - */ -void clusterDelNode(clusterNode *delnode) { - int j; - dictIterator *di; - dictEntry *de; - - /* 1) Mark slots as unassigned. */ - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (server.cluster->importing_slots_from[j] == delnode) - server.cluster->importing_slots_from[j] = NULL; - if (server.cluster->migrating_slots_to[j] == delnode) - server.cluster->migrating_slots_to[j] = NULL; - if (server.cluster->slots[j] == delnode) - clusterDelSlot(j); - } - - /* 2) Remove failure reports. 
*/ - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - - if (node == delnode) continue; - clusterNodeDelFailureReport(node,delnode); - } - dictReleaseIterator(di); - - /* 3) Remove the node from the owning shard */ - clusterRemoveNodeFromShard(delnode); - - /* 4) Free the node, unlinking it from the cluster. */ - freeClusterNode(delnode); -} - -/* Cluster node sanity check. Returns C_OK if the node id - * is valid an C_ERR otherwise. */ -int verifyClusterNodeId(const char *name, int length) { - if (length != CLUSTER_NAMELEN) return C_ERR; - for (int i = 0; i < length; i++) { - if (name[i] >= 'a' && name[i] <= 'z') continue; - if (name[i] >= '0' && name[i] <= '9') continue; - return C_ERR; - } - return C_OK; -} - -/* Node lookup by name */ -clusterNode *clusterLookupNode(const char *name, int length) { - if (verifyClusterNodeId(name, length) != C_OK) return NULL; - sds s = sdsnewlen(name, length); - dictEntry *de = dictFind(server.cluster->nodes, s); - sdsfree(s); - if (de == NULL) return NULL; - return dictGetVal(de); -} - -/* Get all the nodes in my shard. - * Note that the list returned is not computed on the fly - * via slaveof; rather, it is maintained permanently to - * track the shard membership and its life cycle is tied - * to this Redis process. Therefore, the caller must not - * release the list. */ -list *clusterGetNodesInMyShard(clusterNode *node) { - sds s = sdsnewlen(node->shard_id, CLUSTER_NAMELEN); - dictEntry *de = dictFind(server.cluster->shards,s); - sdsfree(s); - return (de != NULL) ? dictGetVal(de) : NULL; -} - -/* This is only used after the handshake. When we connect a given IP/PORT - * as a result of CLUSTER MEET we don't have the node name yet, so we - * pick a random one, and will fix it when we receive the PONG request using - * this function. 
*/ -void clusterRenameNode(clusterNode *node, char *newname) { - int retval; - sds s = sdsnewlen(node->name, CLUSTER_NAMELEN); - - serverLog(LL_DEBUG,"Renaming node %.40s into %.40s", - node->name, newname); - retval = dictDelete(server.cluster->nodes, s); - sdsfree(s); - serverAssert(retval == DICT_OK); - memcpy(node->name, newname, CLUSTER_NAMELEN); - clusterAddNode(node); -} - -void clusterAddNodeToShard(const char *shard_id, clusterNode *node) { - sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN); - dictEntry *de = dictFind(server.cluster->shards,s); - if (de == NULL) { - list *l = listCreate(); - listAddNodeTail(l, node); - serverAssert(dictAdd(server.cluster->shards, s, l) == DICT_OK); - } else { - list *l = dictGetVal(de); - if (listSearchKey(l, node) == NULL) { - listAddNodeTail(l, node); - } - sdsfree(s); - } -} - -void clusterRemoveNodeFromShard(clusterNode *node) { - sds s = sdsnewlen(node->shard_id, CLUSTER_NAMELEN); - dictEntry *de = dictFind(server.cluster->shards, s); - if (de != NULL) { - list *l = dictGetVal(de); - listNode *ln = listSearchKey(l, node); - if (ln != NULL) { - listDelNode(l, ln); - } - if (listLength(l) == 0) { - dictDelete(server.cluster->shards, s); - } - } - sdsfree(s); -} - -/* ----------------------------------------------------------------------------- - * CLUSTER config epoch handling - * -------------------------------------------------------------------------- */ - -/* Return the greatest configEpoch found in the cluster, or the current - * epoch if greater than any node configEpoch. 
*/ -uint64_t clusterGetMaxEpoch(void) { - uint64_t max = 0; - dictIterator *di; - dictEntry *de; - - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - if (node->configEpoch > max) max = node->configEpoch; - } - dictReleaseIterator(di); - if (max < server.cluster->currentEpoch) max = server.cluster->currentEpoch; - return max; -} - -/* If this node epoch is zero or is not already the greatest across the - * cluster (from the POV of the local configuration), this function will: - * - * 1) Generate a new config epoch, incrementing the current epoch. - * 2) Assign the new epoch to this node, WITHOUT any consensus. - * 3) Persist the configuration on disk before sending packets with the - * new configuration. - * - * If the new config epoch is generated and assigned, C_OK is returned, - * otherwise C_ERR is returned (since the node has already the greatest - * configuration around) and no operation is performed. - * - * Important note: this function violates the principle that config epochs - * should be generated with consensus and should be unique across the cluster. - * However Redis Cluster uses this auto-generated new config epochs in two - * cases: - * - * 1) When slots are closed after importing. Otherwise resharding would be - * too expensive. - * 2) When CLUSTER FAILOVER is called with options that force a slave to - * failover its master even if there is not master majority able to - * create a new configuration epoch. - * - * Redis Cluster will not explode using this function, even in the case of - * a collision between this node and another node, generating the same - * configuration epoch unilaterally, because the config epoch conflict - * resolution algorithm will eventually move colliding nodes to different - * config epochs. However using this function may violate the "last failover - * wins" rule, so should only be used with care. 
*/ -int clusterBumpConfigEpochWithoutConsensus(void) { - uint64_t maxEpoch = clusterGetMaxEpoch(); - - if (myself->configEpoch == 0 || - myself->configEpoch != maxEpoch) - { - server.cluster->currentEpoch++; - myself->configEpoch = server.cluster->currentEpoch; - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_FSYNC_CONFIG); - serverLog(LL_NOTICE, - "New configEpoch set to %llu", - (unsigned long long) myself->configEpoch); - return C_OK; - } else { - return C_ERR; - } -} - -/* This function is called when this node is a master, and we receive from - * another master a configuration epoch that is equal to our configuration - * epoch. - * - * BACKGROUND - * - * It is not possible that different slaves get the same config - * epoch during a failover election, because the slaves need to get voted - * by a majority. However when we perform a manual resharding of the cluster - * the node will assign a configuration epoch to itself without to ask - * for agreement. Usually resharding happens when the cluster is working well - * and is supervised by the sysadmin, however it is possible for a failover - * to happen exactly while the node we are resharding a slot to assigns itself - * a new configuration epoch, but before it is able to propagate it. - * - * So technically it is possible in this condition that two nodes end with - * the same configuration epoch. - * - * Another possibility is that there are bugs in the implementation causing - * this to happen. - * - * Moreover when a new cluster is created, all the nodes start with the same - * configEpoch. This collision resolution code allows nodes to automatically - * end with a different configEpoch at startup automatically. - * - * In all the cases, we want a mechanism that resolves this issue automatically - * as a safeguard. 
The same configuration epoch for masters serving different - * set of slots is not harmful, but it is if the nodes end serving the same - * slots for some reason (manual errors or software bugs) without a proper - * failover procedure. - * - * In general we want a system that eventually always ends with different - * masters having different configuration epochs whatever happened, since - * nothing is worse than a split-brain condition in a distributed system. - * - * BEHAVIOR - * - * When this function gets called, what happens is that if this node - * has the lexicographically smaller Node ID compared to the other node - * with the conflicting epoch (the 'sender' node), it will assign itself - * the greatest configuration epoch currently detected among nodes plus 1. - * - * This means that even if there are multiple nodes colliding, the node - * with the greatest Node ID never moves forward, so eventually all the nodes - * end with a different configuration epoch. - */ -void clusterHandleConfigEpochCollision(clusterNode *sender) { - /* Prerequisites: nodes have the same configEpoch and are both masters. */ - if (sender->configEpoch != myself->configEpoch || - !nodeIsMaster(sender) || !nodeIsMaster(myself)) return; - /* Don't act if the colliding node has a smaller Node ID. */ - if (memcmp(sender->name,myself->name,CLUSTER_NAMELEN) <= 0) return; - /* Get the next ID available at the best of this node knowledge. */ - server.cluster->currentEpoch++; - myself->configEpoch = server.cluster->currentEpoch; - clusterSaveConfigOrDie(1); - serverLog(LL_VERBOSE, - "WARNING: configEpoch collision with node %.40s (%s)." 
- " configEpoch set to %llu", - sender->name,sender->human_nodename, - (unsigned long long) myself->configEpoch); -} - -/* ----------------------------------------------------------------------------- - * CLUSTER nodes blacklist - * - * The nodes blacklist is just a way to ensure that a given node with a given - * Node ID is not re-added before some time elapsed (this time is specified - * in seconds in CLUSTER_BLACKLIST_TTL). - * - * This is useful when we want to remove a node from the cluster completely: - * when CLUSTER FORGET is called, it also puts the node into the blacklist so - * that even if we receive gossip messages from other nodes that still remember - * about the node we want to remove, we don't re-add it before some time. - * - * Currently the CLUSTER_BLACKLIST_TTL is set to 1 minute, this means - * that redis-cli has 60 seconds to send CLUSTER FORGET messages to nodes - * in the cluster without dealing with the problem of other nodes re-adding - * back the node to nodes we already sent the FORGET command to. - * - * The data structure used is a hash table with an sds string representing - * the node ID as key, and the time when it is ok to re-add the node as - * value. - * -------------------------------------------------------------------------- */ - -#define CLUSTER_BLACKLIST_TTL 60 /* 1 minute. */ - - -/* Before of the addNode() or Exists() operations we always remove expired - * entries from the black list. This is an O(N) operation but it is not a - * problem since add / exists operations are called very infrequently and - * the hash table is supposed to contain very little elements at max. - * However without the cleanup during long uptime and with some automated - * node add/removal procedures, entries could accumulate. 
*/ -void clusterBlacklistCleanup(void) { - dictIterator *di; - dictEntry *de; - - di = dictGetSafeIterator(server.cluster->nodes_black_list); - while((de = dictNext(di)) != NULL) { - int64_t expire = dictGetUnsignedIntegerVal(de); - - if (expire < server.unixtime) - dictDelete(server.cluster->nodes_black_list,dictGetKey(de)); - } - dictReleaseIterator(di); -} - -/* Cleanup the blacklist and add a new node ID to the black list. */ -void clusterBlacklistAddNode(clusterNode *node) { - dictEntry *de; - sds id = sdsnewlen(node->name,CLUSTER_NAMELEN); - - clusterBlacklistCleanup(); - if (dictAdd(server.cluster->nodes_black_list,id,NULL) == DICT_OK) { - /* If the key was added, duplicate the sds string representation of - * the key for the next lookup. We'll free it at the end. */ - id = sdsdup(id); - } - de = dictFind(server.cluster->nodes_black_list,id); - dictSetUnsignedIntegerVal(de,time(NULL)+CLUSTER_BLACKLIST_TTL); - sdsfree(id); -} - -/* Return non-zero if the specified node ID exists in the blacklist. - * You don't need to pass an sds string here, any pointer to 40 bytes - * will work. */ -int clusterBlacklistExists(char *nodeid) { - sds id = sdsnewlen(nodeid,CLUSTER_NAMELEN); - int retval; - - clusterBlacklistCleanup(); - retval = dictFind(server.cluster->nodes_black_list,id) != NULL; - sdsfree(id); - return retval; -} - -/* ----------------------------------------------------------------------------- - * CLUSTER messages exchange - PING/PONG and gossip - * -------------------------------------------------------------------------- */ - -/* This function checks if a given node should be marked as FAIL. - * It happens if the following conditions are met: - * - * 1) We received enough failure reports from other master nodes via gossip. - * Enough means that the majority of the masters signaled the node is - * down recently. - * 2) We believe this node is in PFAIL state. 
- * - * If a failure is detected we also inform the whole cluster about this - * event trying to force every other node to set the FAIL flag for the node. - * - * Note that the form of agreement used here is weak, as we collect the majority - * of masters state during some time, and even if we force agreement by - * propagating the FAIL message, because of partitions we may not reach every - * node. However: - * - * 1) Either we reach the majority and eventually the FAIL state will propagate - * to all the cluster. - * 2) Or there is no majority so no slave promotion will be authorized and the - * FAIL flag will be cleared after some time. - */ -void markNodeAsFailingIfNeeded(clusterNode *node) { - int failures; - int needed_quorum = (server.cluster->size / 2) + 1; - - if (!nodeTimedOut(node)) return; /* We can reach it. */ - if (nodeFailed(node)) return; /* Already FAILing. */ - - failures = clusterNodeFailureReportsCount(node); - /* Also count myself as a voter if I'm a master. */ - if (nodeIsMaster(myself)) failures++; - if (failures < needed_quorum) return; /* No weak agreement from masters. */ - - serverLog(LL_NOTICE, - "Marking node %.40s (%s) as failing (quorum reached).", node->name, node->human_nodename); - - /* Mark the node as failing. */ - node->flags &= ~CLUSTER_NODE_PFAIL; - node->flags |= CLUSTER_NODE_FAIL; - node->fail_time = mstime(); - - /* Broadcast the failing node name to everybody, forcing all the other - * reachable nodes to flag the node as FAIL. - * We do that even if this node is a replica and not a master: anyway - * the failing state is triggered collecting failure reports from masters, - * so here the replica is only helping propagating this status. */ - clusterSendFail(node->name); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); -} - -/* This function is called only if a node is marked as FAIL, but we are able - * to reach it again. It checks if there are the conditions to undo the FAIL - * state. 
*/ -void clearNodeFailureIfNeeded(clusterNode *node) { - mstime_t now = mstime(); - - serverAssert(nodeFailed(node)); - - /* For slaves we always clear the FAIL flag if we can contact the - * node again. */ - if (nodeIsSlave(node) || node->numslots == 0) { - serverLog(LL_NOTICE, - "Clear FAIL state for node %.40s (%s):%s is reachable again.", - node->name,node->human_nodename, - nodeIsSlave(node) ? "replica" : "master without slots"); - node->flags &= ~CLUSTER_NODE_FAIL; - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); - } - - /* If it is a master and... - * 1) The FAIL state is old enough. - * 2) It is yet serving slots from our point of view (not failed over). - * Apparently no one is going to fix these slots, clear the FAIL flag. */ - if (nodeIsMaster(node) && node->numslots > 0 && - (now - node->fail_time) > - (server.cluster_node_timeout * CLUSTER_FAIL_UNDO_TIME_MULT)) - { - serverLog(LL_NOTICE, - "Clear FAIL state for node %.40s (%s): is reachable again and nobody is serving its slots after some time.", - node->name, node->human_nodename); - node->flags &= ~CLUSTER_NODE_FAIL; - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); - } -} - -/* Return true if we already have a node in HANDSHAKE state matching the - * specified ip address and port number. This function is used in order to - * avoid adding a new handshake node for the same address multiple times. */ -int clusterHandshakeInProgress(char *ip, int port, int cport) { - dictIterator *di; - dictEntry *de; - - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - - if (!nodeInHandshake(node)) continue; - if (!strcasecmp(node->ip,ip) && - getNodeDefaultClientPort(node) == port && - node->cport == cport) break; - } - dictReleaseIterator(di); - return de != NULL; -} - -/* Start a handshake with the specified address if there is not one - * already in progress. 
Returns non-zero if the handshake was actually - * started. On error zero is returned and errno is set to one of the - * following values: - * - * EAGAIN - There is already a handshake in progress for this address. - * EINVAL - IP or port are not valid. */ -int clusterStartHandshake(char *ip, int port, int cport) { - clusterNode *n; - char norm_ip[NET_IP_STR_LEN]; - struct sockaddr_storage sa; - - /* IP sanity check */ - if (inet_pton(AF_INET,ip, - &(((struct sockaddr_in *)&sa)->sin_addr))) - { - sa.ss_family = AF_INET; - } else if (inet_pton(AF_INET6,ip, - &(((struct sockaddr_in6 *)&sa)->sin6_addr))) - { - sa.ss_family = AF_INET6; - } else { - errno = EINVAL; - return 0; - } - - /* Port sanity check */ - if (port <= 0 || port > 65535 || cport <= 0 || cport > 65535) { - errno = EINVAL; - return 0; - } - - /* Set norm_ip as the normalized string representation of the node - * IP address. */ - memset(norm_ip,0,NET_IP_STR_LEN); - if (sa.ss_family == AF_INET) - inet_ntop(AF_INET, - (void*)&(((struct sockaddr_in *)&sa)->sin_addr), - norm_ip,NET_IP_STR_LEN); - else - inet_ntop(AF_INET6, - (void*)&(((struct sockaddr_in6 *)&sa)->sin6_addr), - norm_ip,NET_IP_STR_LEN); - - if (clusterHandshakeInProgress(norm_ip,port,cport)) { - errno = EAGAIN; - return 0; - } - - /* Add the node with a random address (NULL as first argument to - * createClusterNode()). Everything will be fixed during the - * handshake. 
*/ - n = createClusterNode(NULL,CLUSTER_NODE_HANDSHAKE|CLUSTER_NODE_MEET); - memcpy(n->ip,norm_ip,sizeof(n->ip)); +ConnectionType *connTypeOfCluster(void) { if (server.tls_cluster) { - n->tls_port = port; - } else { - n->tcp_port = port; + return connectionTypeTls(); } - n->cport = cport; - clusterAddNode(n); - return 1; -} -static void getClientPortFromClusterMsg(clusterMsg *hdr, int *tls_port, int *tcp_port) { - if (server.tls_cluster) { - *tls_port = ntohs(hdr->port); - *tcp_port = ntohs(hdr->pport); - } else { - *tls_port = ntohs(hdr->pport); - *tcp_port = ntohs(hdr->port); - } -} - -static void getClientPortFromGossip(clusterMsgDataGossip *g, int *tls_port, int *tcp_port) { - if (server.tls_cluster) { - *tls_port = ntohs(g->port); - *tcp_port = ntohs(g->pport); - } else { - *tls_port = ntohs(g->pport); - *tcp_port = ntohs(g->port); - } -} - -/* Process the gossip section of PING or PONG packets. - * Note that this function assumes that the packet is already sanity-checked - * by the caller, not in the content of the gossip section, but in the - * length. */ -void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { - uint16_t count = ntohs(hdr->count); - clusterMsgDataGossip *g = (clusterMsgDataGossip*) hdr->data.ping.gossip; - clusterNode *sender = link->node ? link->node : clusterLookupNode(hdr->sender, CLUSTER_NAMELEN); - - while(count--) { - uint16_t flags = ntohs(g->flags); - clusterNode *node; - sds ci; - - if (server.verbosity == LL_DEBUG) { - ci = representClusterNodeFlags(sdsempty(), flags); - serverLog(LL_DEBUG,"GOSSIP %.40s %s:%d@%d %s", - g->nodename, - g->ip, - ntohs(g->port), - ntohs(g->cport), - ci); - sdsfree(ci); - } - - /* Convert port and pport into TCP port and TLS port. 
*/ - int msg_tls_port, msg_tcp_port; - getClientPortFromGossip(g, &msg_tls_port, &msg_tcp_port); - - /* Update our state accordingly to the gossip sections */ - node = clusterLookupNode(g->nodename, CLUSTER_NAMELEN); - if (node) { - /* We already know this node. - Handle failure reports, only when the sender is a master. */ - if (sender && nodeIsMaster(sender) && node != myself) { - if (flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) { - if (clusterNodeAddFailureReport(node,sender)) { - serverLog(LL_VERBOSE, - "Node %.40s (%s) reported node %.40s (%s) as not reachable.", - sender->name, sender->human_nodename, node->name, node->human_nodename); - } - markNodeAsFailingIfNeeded(node); - } else { - if (clusterNodeDelFailureReport(node,sender)) { - serverLog(LL_VERBOSE, - "Node %.40s (%s) reported node %.40s (%s) is back online.", - sender->name, sender->human_nodename, node->name, node->human_nodename); - } - } - } - - /* If from our POV the node is up (no failure flags are set), - * we have no pending ping for the node, nor we have failure - * reports for this node, update the last pong time with the - * one we see from the other nodes. */ - if (!(flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) && - node->ping_sent == 0 && - clusterNodeFailureReportsCount(node) == 0) - { - mstime_t pongtime = ntohl(g->pong_received); - pongtime *= 1000; /* Convert back to milliseconds. */ - - /* Replace the pong time with the received one only if - * it's greater than our view but is not in the future - * (with 500 milliseconds tolerance) from the POV of our - * clock. */ - if (pongtime <= (server.mstime+500) && - pongtime > node->pong_received) - { - node->pong_received = pongtime; - } - } - - /* If we already know this node, but it is not reachable, and - * we see a different address in the gossip section of a node that - * can talk with this other node, update the address, disconnect - * the old link if any, so that we'll attempt to connect with the - * new address. 
*/ - if (node->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL) && - !(flags & CLUSTER_NODE_NOADDR) && - !(flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) && - (strcasecmp(node->ip,g->ip) || - node->tls_port != (server.tls_cluster ? ntohs(g->port) : ntohs(g->pport)) || - node->tcp_port != (server.tls_cluster ? ntohs(g->pport) : ntohs(g->port)) || - node->cport != ntohs(g->cport))) - { - if (node->link) freeClusterLink(node->link); - memcpy(node->ip,g->ip,NET_IP_STR_LEN); - node->tcp_port = msg_tcp_port; - node->tls_port = msg_tls_port; - node->cport = ntohs(g->cport); - node->flags &= ~CLUSTER_NODE_NOADDR; - } - } else { - /* If it's not in NOADDR state and we don't have it, we - * add it to our trusted dict with exact nodeid and flag. - * Note that we cannot simply start a handshake against - * this IP/PORT pairs, since IP/PORT can be reused already, - * otherwise we risk joining another cluster. - * - * Note that we require that the sender of this gossip message - * is a well known node in our cluster, otherwise we risk - * joining another cluster. */ - if (sender && - !(flags & CLUSTER_NODE_NOADDR) && - !clusterBlacklistExists(g->nodename)) - { - clusterNode *node; - node = createClusterNode(g->nodename, flags); - memcpy(node->ip,g->ip,NET_IP_STR_LEN); - node->tcp_port = msg_tcp_port; - node->tls_port = msg_tls_port; - node->cport = ntohs(g->cport); - clusterAddNode(node); - } - } - - /* Next node */ - g++; - } -} - -/* IP -> string conversion. 'buf' is supposed to at least be 46 bytes. - * If 'announced_ip' length is non-zero, it is used instead of extracting - * the IP from the socket peer address. */ -int nodeIp2String(char *buf, clusterLink *link, char *announced_ip) { - if (announced_ip[0] != '\0') { - memcpy(buf,announced_ip,NET_IP_STR_LEN); - buf[NET_IP_STR_LEN-1] = '\0'; /* We are not sure the input is sane. 
*/ - return C_OK; - } else { - if (connAddrPeerName(link->conn, buf, NET_IP_STR_LEN, NULL) == -1) { - serverLog(LL_NOTICE, "Error converting peer IP to string: %s", - link->conn ? connGetLastError(link->conn) : "no link"); - return C_ERR; - } - return C_OK; - } -} - -/* Update the node address to the IP address that can be extracted - * from link->fd, or if hdr->myip is non empty, to the address the node - * is announcing us. The port is taken from the packet header as well. - * - * If the address or port changed, disconnect the node link so that we'll - * connect again to the new address. - * - * If the ip/port pair are already correct no operation is performed at - * all. - * - * The function returns 0 if the node address is still the same, - * otherwise 1 is returned. */ -int nodeUpdateAddressIfNeeded(clusterNode *node, clusterLink *link, - clusterMsg *hdr) -{ - char ip[NET_IP_STR_LEN] = {0}; - int cport = ntohs(hdr->cport); - int tcp_port, tls_port; - getClientPortFromClusterMsg(hdr, &tls_port, &tcp_port); - - /* We don't proceed if the link is the same as the sender link, as this - * function is designed to see if the node link is consistent with the - * symmetric link that is used to receive PINGs from the node. - * - * As a side effect this function never frees the passed 'link', so - * it is safe to call during packet processing. */ - if (link == node->link) return 0; - - /* If the peer IP is unavailable for some reasons like invalid fd or closed - * link, just give up the update this time, and the update will be retried - * in the next round of PINGs */ - if (nodeIp2String(ip,link,hdr->myip) == C_ERR) return 0; - - if (node->tcp_port == tcp_port && node->cport == cport && node->tls_port == tls_port && - strcmp(ip,node->ip) == 0) return 0; - - /* IP / port is different, update it. 
*/ - memcpy(node->ip,ip,sizeof(ip)); - node->tcp_port = tcp_port; - node->tls_port = tls_port; - node->cport = cport; - if (node->link) freeClusterLink(node->link); - node->flags &= ~CLUSTER_NODE_NOADDR; - serverLog(LL_NOTICE,"Address updated for node %.40s (%s), now %s:%d", - node->name, node->human_nodename, node->ip, getNodeDefaultClientPort(node)); - - /* Check if this is our master and we have to change the - * replication target as well. */ - if (nodeIsSlave(myself) && myself->slaveof == node) - replicationSetMaster(node->ip, getNodeDefaultReplicationPort(node)); - return 1; -} - -/* Reconfigure the specified node 'n' as a master. This function is called when - * a node that we believed to be a slave is now acting as master in order to - * update the state of the node. */ -void clusterSetNodeAsMaster(clusterNode *n) { - if (nodeIsMaster(n)) return; - - if (n->slaveof) { - clusterNodeRemoveSlave(n->slaveof,n); - if (n != myself) n->flags |= CLUSTER_NODE_MIGRATE_TO; - } - n->flags &= ~CLUSTER_NODE_SLAVE; - n->flags |= CLUSTER_NODE_MASTER; - n->slaveof = NULL; - - /* Update config and state. */ - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE); -} - -/* This function is called when we receive a master configuration via a - * PING, PONG or UPDATE packet. What we receive is a node, a configEpoch of the - * node, and the set of slots claimed under this configEpoch. - * - * What we do is to rebind the slots with newer configuration compared to our - * local configuration, and if needed, we turn ourself into a replica of the - * node (see the function comments for more info). - * - * The 'sender' is the node for which we received a configuration update. - * Sometimes it is not actually the "Sender" of the information, like in the - * case we receive the info via an UPDATE packet. 
*/ -void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoch, unsigned char *slots) { - int j; - clusterNode *curmaster = NULL, *newmaster = NULL; - /* The dirty slots list is a list of slots for which we lose the ownership - * while having still keys inside. This usually happens after a failover - * or after a manual cluster reconfiguration operated by the admin. - * - * If the update message is not able to demote a master to slave (in this - * case we'll resync with the master updating the whole key space), we - * need to delete all the keys in the slots we lost ownership. */ - uint16_t dirty_slots[CLUSTER_SLOTS]; - int dirty_slots_count = 0; - - /* We should detect if sender is new master of our shard. - * We will know it if all our slots were migrated to sender, and sender - * has no slots except ours */ - int sender_slots = 0; - int migrated_our_slots = 0; - - /* Here we set curmaster to this node or the node this node - * replicates to if it's a slave. In the for loop we are - * interested to check if slots are taken away from curmaster. */ - curmaster = nodeIsMaster(myself) ? myself : myself->slaveof; - - if (sender == myself) { - serverLog(LL_NOTICE,"Discarding UPDATE message about myself."); - return; - } - - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (bitmapTestBit(slots,j)) { - sender_slots++; - - /* The slot is already bound to the sender of this message. */ - if (server.cluster->slots[j] == sender) { - bitmapClearBit(server.cluster->owner_not_claiming_slot, j); - continue; - } - - /* The slot is in importing state, it should be modified only - * manually via redis-cli (example: a resharding is in progress - * and the migrating side slot was already closed and is advertising - * a new config. We still want the slot to be closed manually). 
*/ - if (server.cluster->importing_slots_from[j]) continue; - - /* We rebind the slot to the new node claiming it if: - * 1) The slot was unassigned or the previous owner no longer owns the slot or - * the new node claims it with a greater configEpoch. - * 2) We are not currently importing the slot. */ - if (isSlotUnclaimed(j) || - server.cluster->slots[j]->configEpoch < senderConfigEpoch) - { - /* Was this slot mine, and still contains keys? Mark it as - * a dirty slot. */ - if (server.cluster->slots[j] == myself && - countKeysInSlot(j) && - sender != myself) - { - dirty_slots[dirty_slots_count] = j; - dirty_slots_count++; - } - - if (server.cluster->slots[j] == curmaster) { - newmaster = sender; - migrated_our_slots++; - } - clusterDelSlot(j); - clusterAddSlot(sender,j); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_FSYNC_CONFIG); - } - } else if (server.cluster->slots[j] == sender) { - /* The slot is currently bound to the sender but the sender is no longer - * claiming it. We don't want to unbind the slot yet as it can cause the cluster - * to move to FAIL state and also throw client error. Keeping the slot bound to - * the previous owner will cause a few client side redirects, but won't throw - * any errors. We will keep track of the uncertainty in ownership to avoid - * propagating misinformation about this slot's ownership using UPDATE - * messages. */ - bitmapSetBit(server.cluster->owner_not_claiming_slot, j); - } - } - - /* After updating the slots configuration, don't do any actual change - * in the state of the server if a module disabled Redis Cluster - * keys redirections. */ - if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION) - return; - - /* If at least one slot was reassigned from a node to another node - * with a greater configEpoch, it is possible that: - * 1) We are a master left without slots. 
This means that we were - * failed over and we should turn into a replica of the new - * master. - * 2) We are a slave and our master is left without slots. We need - * to replicate to the new slots owner. */ - if (newmaster && curmaster->numslots == 0 && - (server.cluster_allow_replica_migration || - sender_slots == migrated_our_slots)) { - serverLog(LL_NOTICE, - "Configuration change detected. Reconfiguring myself " - "as a replica of %.40s (%s)", sender->name, sender->human_nodename); - clusterSetMaster(sender); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_FSYNC_CONFIG); - } else if (myself->slaveof && myself->slaveof->slaveof && - /* In some rare case when CLUSTER FAILOVER TAKEOVER is used, it - * can happen that myself is a replica of a replica of myself. If - * this happens, we do nothing to avoid a crash and wait for the - * admin to repair the cluster. */ - myself->slaveof->slaveof != myself) - { - /* Safeguard against sub-replicas. A replica's master can turn itself - * into a replica if its last slot is removed. If no other node takes - * over the slot, there is nothing else to trigger replica migration. */ - serverLog(LL_NOTICE, - "I'm a sub-replica! Reconfiguring myself as a replica of grandmaster %.40s (%s)", - myself->slaveof->slaveof->name, myself->slaveof->slaveof->human_nodename); - clusterSetMaster(myself->slaveof->slaveof); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_FSYNC_CONFIG); - } else if (dirty_slots_count) { - /* If we are here, we received an update message which removed - * ownership for certain slots we still have keys about, but still - * we are serving some slots, so this master node was not demoted to - * a slave. - * - * In order to maintain a consistent state between keys and slots - * we need to remove all the keys from the slots we lost. 
*/ - for (j = 0; j < dirty_slots_count; j++) - delKeysInSlot(dirty_slots[j]); - } -} - -/* Cluster ping extensions. - * - * The ping/pong/meet messages support arbitrary extensions to add additional - * metadata to the messages that are sent between the various nodes in the - * cluster. The extensions take the form: - * [ Header length + type (8 bytes) ] - * [ Extension information (Arbitrary length, but must be 8 byte padded) ] - */ - - -/* Returns the length of a given extension */ -static uint32_t getPingExtLength(clusterMsgPingExt *ext) { - return ntohl(ext->length); -} - -/* Returns the initial position of ping extensions. May return an invalid - * address if there are no ping extensions. */ -static clusterMsgPingExt *getInitialPingExt(clusterMsg *hdr, int count) { - clusterMsgPingExt *initial = (clusterMsgPingExt*) &(hdr->data.ping.gossip[count]); - return initial; -} - -/* Given a current ping extension, returns the start of the next extension. May return - * an invalid address if there are no further ping extensions. 
*/ -static clusterMsgPingExt *getNextPingExt(clusterMsgPingExt *ext) { - clusterMsgPingExt *next = (clusterMsgPingExt *) (((char *) ext) + getPingExtLength(ext)); - return next; -} - -/* All PING extensions must be 8-byte aligned */ -uint32_t getAlignedPingExtSize(uint32_t dataSize) { - - return sizeof(clusterMsgPingExt) + EIGHT_BYTE_ALIGN(dataSize); -} - -uint32_t getHostnamePingExtSize(void) { - if (sdslen(myself->hostname) == 0) { - return 0; - } - return getAlignedPingExtSize(sdslen(myself->hostname) + 1); -} - -uint32_t getHumanNodenamePingExtSize(void) { - if (sdslen(myself->human_nodename) == 0) { - return 0; - } - return getAlignedPingExtSize(sdslen(myself->human_nodename) + 1); -} - -uint32_t getShardIdPingExtSize(void) { - return getAlignedPingExtSize(sizeof(clusterMsgPingExtShardId)); -} - -uint32_t getForgottenNodeExtSize(void) { - return getAlignedPingExtSize(sizeof(clusterMsgPingExtForgottenNode)); -} - -void *preparePingExt(clusterMsgPingExt *ext, uint16_t type, uint32_t length) { - ext->type = htons(type); - ext->length = htonl(length); - return &ext->ext[0]; -} - -clusterMsgPingExt *nextPingExt(clusterMsgPingExt *ext) { - return (clusterMsgPingExt *)((char*)ext + ntohl(ext->length)); -} - -/* 1. If a NULL hdr is provided, compute the extension size; - * 2. If a non-NULL hdr is provided, write the hostname ping - * extension at the start of the cursor. This function - * will update the cursor to point to the end of the - * written extension and will return the amount of bytes - * written. 
*/ -uint32_t writePingExt(clusterMsg *hdr, int gossipcount) { - uint16_t extensions = 0; - uint32_t totlen = 0; - clusterMsgPingExt *cursor = NULL; - /* Set the initial extension position */ - if (hdr != NULL) { - cursor = getInitialPingExt(hdr, gossipcount); - } - - /* hostname is optional */ - if (sdslen(myself->hostname) != 0) { - if (cursor != NULL) { - /* Populate hostname */ - clusterMsgPingExtHostname *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_HOSTNAME, getHostnamePingExtSize()); - memcpy(ext->hostname, myself->hostname, sdslen(myself->hostname)); - - /* Move the write cursor */ - cursor = nextPingExt(cursor); - } - - totlen += getHostnamePingExtSize(); - extensions++; - } - - if (sdslen(myself->human_nodename) != 0) { - if (cursor != NULL) { - /* Populate human_nodename */ - clusterMsgPingExtHumanNodename *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_HUMAN_NODENAME, getHumanNodenamePingExtSize()); - memcpy(ext->human_nodename, myself->human_nodename, sdslen(myself->human_nodename)); - - /* Move the write cursor */ - cursor = nextPingExt(cursor); - } - - totlen += getHumanNodenamePingExtSize(); - extensions++; - } - - /* Gossip forgotten nodes */ - if (dictSize(server.cluster->nodes_black_list) > 0) { - dictIterator *di = dictGetIterator(server.cluster->nodes_black_list); - dictEntry *de; - while ((de = dictNext(di)) != NULL) { - if (cursor != NULL) { - uint64_t expire = dictGetUnsignedIntegerVal(de); - if ((time_t)expire < server.unixtime) continue; /* already expired */ - uint64_t ttl = expire - server.unixtime; - clusterMsgPingExtForgottenNode *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_FORGOTTEN_NODE, getForgottenNodeExtSize()); - memcpy(ext->name, dictGetKey(de), CLUSTER_NAMELEN); - ext->ttl = htonu64(ttl); - - /* Move the write cursor */ - cursor = nextPingExt(cursor); - } - totlen += getForgottenNodeExtSize(); - extensions++; - } - dictReleaseIterator(di); - } - - /* Populate shard_id */ - if (cursor != NULL) { - 
clusterMsgPingExtShardId *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_SHARDID, getShardIdPingExtSize()); - memcpy(ext->shard_id, myself->shard_id, CLUSTER_NAMELEN); - - /* Move the write cursor */ - cursor = nextPingExt(cursor); - } - totlen += getShardIdPingExtSize(); - extensions++; - - if (hdr != NULL) { - if (extensions != 0) { - hdr->mflags[0] |= CLUSTERMSG_FLAG0_EXT_DATA; - } - hdr->extensions = htons(extensions); - } - - return totlen; -} - -/* We previously validated the extensions, so this function just needs to - * handle the extensions. */ -void clusterProcessPingExtensions(clusterMsg *hdr, clusterLink *link) { - clusterNode *sender = link->node ? link->node : clusterLookupNode(hdr->sender, CLUSTER_NAMELEN); - char *ext_hostname = NULL; - char *ext_humannodename = NULL; - char *ext_shardid = NULL; - uint16_t extensions = ntohs(hdr->extensions); - /* Loop through all the extensions and process them */ - clusterMsgPingExt *ext = getInitialPingExt(hdr, ntohs(hdr->count)); - while (extensions--) { - uint16_t type = ntohs(ext->type); - if (type == CLUSTERMSG_EXT_TYPE_HOSTNAME) { - clusterMsgPingExtHostname *hostname_ext = (clusterMsgPingExtHostname *) &(ext->ext[0].hostname); - ext_hostname = hostname_ext->hostname; - } else if (type == CLUSTERMSG_EXT_TYPE_HUMAN_NODENAME) { - clusterMsgPingExtHumanNodename *humannodename_ext = (clusterMsgPingExtHumanNodename *) &(ext->ext[0].human_nodename); - ext_humannodename = humannodename_ext->human_nodename; - } else if (type == CLUSTERMSG_EXT_TYPE_FORGOTTEN_NODE) { - clusterMsgPingExtForgottenNode *forgotten_node_ext = &(ext->ext[0].forgotten_node); - clusterNode *n = clusterLookupNode(forgotten_node_ext->name, CLUSTER_NAMELEN); - if (n && n != myself && !(nodeIsSlave(myself) && myself->slaveof == n)) { - sds id = sdsnewlen(forgotten_node_ext->name, CLUSTER_NAMELEN); - dictEntry *de = dictAddOrFind(server.cluster->nodes_black_list, id); - uint64_t expire = server.unixtime + ntohu64(forgotten_node_ext->ttl); - 
dictSetUnsignedIntegerVal(de, expire); - clusterDelNode(n); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_SAVE_CONFIG); - } - } else if (type == CLUSTERMSG_EXT_TYPE_SHARDID) { - clusterMsgPingExtShardId *shardid_ext = (clusterMsgPingExtShardId *) &(ext->ext[0].shard_id); - ext_shardid = shardid_ext->shard_id; - } else { - /* Unknown type, we will ignore it but log what happened. */ - serverLog(LL_WARNING, "Received unknown extension type %d", type); - } - - /* We know this will be valid since we validated it ahead of time */ - ext = getNextPingExt(ext); - } - /* If the node did not send us a hostname extension, assume - * they don't have an announced hostname. Otherwise, we'll - * set it now. */ - updateAnnouncedHostname(sender, ext_hostname); - updateAnnouncedHumanNodename(sender, ext_humannodename); - updateShardId(sender, ext_shardid); -} - -static clusterNode *getNodeFromLinkAndMsg(clusterLink *link, clusterMsg *hdr) { - clusterNode *sender; - if (link->node && !nodeInHandshake(link->node)) { - /* If the link has an associated node, use that so that we don't have to look it - * up every time, except when the node is still in handshake, the node still has - * a random name thus not truly "known". */ - sender = link->node; - } else { - /* Otherwise, fetch sender based on the message */ - sender = clusterLookupNode(hdr->sender, CLUSTER_NAMELEN); - /* We know the sender node but haven't associate it with the link. This must - * be an inbound link because only for inbound links we didn't know which node - * to associate when they were created. */ - if (sender && !link->node) { - setClusterNodeToInboundClusterLink(sender, link); - } - } - return sender; -} - -/* When this function is called, there is a packet to process starting - * at link->rcvbuf. Releasing the buffer is up to the caller, so this - * function should just handle the higher level stuff of processing the - * packet, modifying the cluster state if needed. 
- * - * The function returns 1 if the link is still valid after the packet - * was processed, otherwise 0 if the link was freed since the packet - * processing lead to some inconsistency error (for instance a PONG - * received from the wrong sender ID). */ -int clusterProcessPacket(clusterLink *link) { - clusterMsg *hdr = (clusterMsg*) link->rcvbuf; - uint32_t totlen = ntohl(hdr->totlen); - uint16_t type = ntohs(hdr->type); - mstime_t now = mstime(); - - if (type < CLUSTERMSG_TYPE_COUNT) - server.cluster->stats_bus_messages_received[type]++; - serverLog(LL_DEBUG,"--- Processing packet of type %s, %lu bytes", - clusterGetMessageTypeString(type), (unsigned long) totlen); - - /* Perform sanity checks */ - if (totlen < 16) return 1; /* At least signature, version, totlen, count. */ - if (totlen > link->rcvbuf_len) return 1; - - if (ntohs(hdr->ver) != CLUSTER_PROTO_VER) { - /* Can't handle messages of different versions. */ - return 1; - } - - if (type == server.cluster_drop_packet_filter) { - serverLog(LL_WARNING, "Dropping packet that matches debug drop filter"); - return 1; - } - - uint16_t flags = ntohs(hdr->flags); - uint16_t extensions = ntohs(hdr->extensions); - uint64_t senderCurrentEpoch = 0, senderConfigEpoch = 0; - uint32_t explen; /* expected length of this packet */ - clusterNode *sender; - - if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG || - type == CLUSTERMSG_TYPE_MEET) - { - uint16_t count = ntohs(hdr->count); - - explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - explen += (sizeof(clusterMsgDataGossip)*count); - - /* If there is extension data, which doesn't have a fixed length, - * loop through them and validate the length of it now. 
*/ - if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) { - clusterMsgPingExt *ext = getInitialPingExt(hdr, count); - while (extensions--) { - uint16_t extlen = getPingExtLength(ext); - if (extlen % 8 != 0) { - serverLog(LL_WARNING, "Received a %s packet without proper padding (%d bytes)", - clusterGetMessageTypeString(type), (int) extlen); - return 1; - } - if ((totlen - explen) < extlen) { - serverLog(LL_WARNING, "Received invalid %s packet with extension data that exceeds " - "total packet length (%lld)", clusterGetMessageTypeString(type), - (unsigned long long) totlen); - return 1; - } - explen += extlen; - ext = getNextPingExt(ext); - } - } - } else if (type == CLUSTERMSG_TYPE_FAIL) { - explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - explen += sizeof(clusterMsgDataFail); - } else if (type == CLUSTERMSG_TYPE_PUBLISH || type == CLUSTERMSG_TYPE_PUBLISHSHARD) { - explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - explen += sizeof(clusterMsgDataPublish) - - 8 + - ntohl(hdr->data.publish.msg.channel_len) + - ntohl(hdr->data.publish.msg.message_len); - } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST || - type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK || - type == CLUSTERMSG_TYPE_MFSTART) - { - explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - } else if (type == CLUSTERMSG_TYPE_UPDATE) { - explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - explen += sizeof(clusterMsgDataUpdate); - } else if (type == CLUSTERMSG_TYPE_MODULE) { - explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - explen += sizeof(clusterMsgModule) - - 3 + ntohl(hdr->data.module.msg.len); - } else { - /* We don't know this type of packet, so we assume it's well formed. 
*/ - explen = totlen; - } - - if (totlen != explen) { - serverLog(LL_WARNING, "Received invalid %s packet of length %lld but expected length %lld", - clusterGetMessageTypeString(type), (unsigned long long) totlen, (unsigned long long) explen); - return 1; - } - - sender = getNodeFromLinkAndMsg(link, hdr); - - /* Update the last time we saw any data from this node. We - * use this in order to avoid detecting a timeout from a node that - * is just sending a lot of data in the cluster bus, for instance - * because of Pub/Sub. */ - if (sender) sender->data_received = now; - - if (sender && !nodeInHandshake(sender)) { - /* Update our currentEpoch if we see a newer epoch in the cluster. */ - senderCurrentEpoch = ntohu64(hdr->currentEpoch); - senderConfigEpoch = ntohu64(hdr->configEpoch); - if (senderCurrentEpoch > server.cluster->currentEpoch) - server.cluster->currentEpoch = senderCurrentEpoch; - /* Update the sender configEpoch if it is publishing a newer one. */ - if (senderConfigEpoch > sender->configEpoch) { - sender->configEpoch = senderConfigEpoch; - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_FSYNC_CONFIG); - } - /* Update the replication offset info for this node. */ - sender->repl_offset = ntohu64(hdr->offset); - sender->repl_offset_time = now; - /* If we are a slave performing a manual failover and our master - * sent its offset while already paused, populate the MF state. */ - if (server.cluster->mf_end && - nodeIsSlave(myself) && - myself->slaveof == sender && - hdr->mflags[0] & CLUSTERMSG_FLAG0_PAUSED && - server.cluster->mf_master_offset == -1) - { - server.cluster->mf_master_offset = sender->repl_offset; - clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER); - serverLog(LL_NOTICE, - "Received replication offset for paused " - "master manual failover: %lld", - server.cluster->mf_master_offset); - } - } - - /* Initial processing of PING and MEET requests replying with a PONG. 
*/ - if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_MEET) { - /* We use incoming MEET messages in order to set the address - * for 'myself', since only other cluster nodes will send us - * MEET messages on handshakes, when the cluster joins, or - * later if we changed address, and those nodes will use our - * official address to connect to us. So by obtaining this address - * from the socket is a simple way to discover / update our own - * address in the cluster without it being hardcoded in the config. - * - * However if we don't have an address at all, we update the address - * even with a normal PING packet. If it's wrong it will be fixed - * by MEET later. */ - if ((type == CLUSTERMSG_TYPE_MEET || myself->ip[0] == '\0') && - server.cluster_announce_ip == NULL) - { - char ip[NET_IP_STR_LEN]; - - if (connAddrSockName(link->conn,ip,sizeof(ip),NULL) != -1 && - strcmp(ip,myself->ip)) - { - memcpy(myself->ip,ip,NET_IP_STR_LEN); - serverLog(LL_NOTICE,"IP address for this node updated to %s", - myself->ip); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); - } - } - - /* Add this node if it is new for us and the msg type is MEET. - * In this stage we don't try to add the node with the right - * flags, slaveof pointer, and so forth, as this details will be - * resolved when we'll receive PONGs from the node. */ - if (!sender && type == CLUSTERMSG_TYPE_MEET) { - clusterNode *node; - - node = createClusterNode(NULL,CLUSTER_NODE_HANDSHAKE); - serverAssert(nodeIp2String(node->ip,link,hdr->myip) == C_OK); - getClientPortFromClusterMsg(hdr, &node->tls_port, &node->tcp_port); - node->cport = ntohs(hdr->cport); - clusterAddNode(node); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); - } - - /* If this is a MEET packet from an unknown node, we still process - * the gossip section here since we have to trust the sender because - * of the message type. 
*/ - if (!sender && type == CLUSTERMSG_TYPE_MEET) - clusterProcessGossipSection(hdr,link); - - /* Anyway reply with a PONG */ - clusterSendPing(link,CLUSTERMSG_TYPE_PONG); - } - - /* PING, PONG, MEET: process config information. */ - if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG || - type == CLUSTERMSG_TYPE_MEET) - { - serverLog(LL_DEBUG,"%s packet received: %.40s", - clusterGetMessageTypeString(type), - link->node ? link->node->name : "NULL"); - if (!link->inbound) { - if (nodeInHandshake(link->node)) { - /* If we already have this node, try to change the - * IP/port of the node with the new one. */ - if (sender) { - serverLog(LL_VERBOSE, - "Handshake: we already know node %.40s (%s), " - "updating the address if needed.", sender->name, sender->human_nodename); - if (nodeUpdateAddressIfNeeded(sender,link,hdr)) - { - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE); - } - /* Free this node as we already have it. This will - * cause the link to be freed as well. */ - clusterDelNode(link->node); - return 0; - } - - /* First thing to do is replacing the random name with the - * right node name if this was a handshake stage. */ - clusterRenameNode(link->node, hdr->sender); - serverLog(LL_DEBUG,"Handshake with node %.40s completed.", - link->node->name); - link->node->flags &= ~CLUSTER_NODE_HANDSHAKE; - link->node->flags |= flags&(CLUSTER_NODE_MASTER|CLUSTER_NODE_SLAVE); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); - } else if (memcmp(link->node->name,hdr->sender, - CLUSTER_NAMELEN) != 0) - { - /* If the reply has a non matching node ID we - * disconnect this node and set it as not having an associated - * address. */ - serverLog(LL_DEBUG,"PONG contains mismatching sender ID. 
About node %.40s added %d ms ago, having flags %d", - link->node->name, - (int)(now-(link->node->ctime)), - link->node->flags); - link->node->flags |= CLUSTER_NODE_NOADDR; - link->node->ip[0] = '\0'; - link->node->tcp_port = 0; - link->node->tls_port = 0; - link->node->cport = 0; - freeClusterLink(link); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); - return 0; - } - } - - /* Copy the CLUSTER_NODE_NOFAILOVER flag from what the sender - * announced. This is a dynamic flag that we receive from the - * sender, and the latest status must be trusted. We need it to - * be propagated because the slave ranking used to understand the - * delay of each slave in the voting process, needs to know - * what are the instances really competing. */ - if (sender) { - int nofailover = flags & CLUSTER_NODE_NOFAILOVER; - sender->flags &= ~CLUSTER_NODE_NOFAILOVER; - sender->flags |= nofailover; - } - - /* Update the node address if it changed. */ - if (sender && type == CLUSTERMSG_TYPE_PING && - !nodeInHandshake(sender) && - nodeUpdateAddressIfNeeded(sender,link,hdr)) - { - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE); - } - - /* Update our info about the node */ - if (!link->inbound && type == CLUSTERMSG_TYPE_PONG) { - link->node->pong_received = now; - link->node->ping_sent = 0; - - /* The PFAIL condition can be reversed without external - * help if it is momentary (that is, if it does not - * turn into a FAIL state). - * - * The FAIL condition is also reversible under specific - * conditions detected by clearNodeFailureIfNeeded(). */ - if (nodeTimedOut(link->node)) { - link->node->flags &= ~CLUSTER_NODE_PFAIL; - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE); - } else if (nodeFailed(link->node)) { - clearNodeFailureIfNeeded(link->node); - } - } - - /* Check for role switch: slave -> master or master -> slave. 
*/ - if (sender) { - if (!memcmp(hdr->slaveof,CLUSTER_NODE_NULL_NAME, - sizeof(hdr->slaveof))) - { - /* Node is a master. */ - clusterSetNodeAsMaster(sender); - } else { - /* Node is a slave. */ - clusterNode *master = clusterLookupNode(hdr->slaveof, CLUSTER_NAMELEN); - - if (nodeIsMaster(sender)) { - /* Master turned into a slave! Reconfigure the node. */ - clusterDelNodeSlots(sender); - sender->flags &= ~(CLUSTER_NODE_MASTER| - CLUSTER_NODE_MIGRATE_TO); - sender->flags |= CLUSTER_NODE_SLAVE; - - /* Update config and state. */ - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE); - } - - /* Master node changed for this slave? */ - if (master && sender->slaveof != master) { - if (sender->slaveof) - clusterNodeRemoveSlave(sender->slaveof,sender); - clusterNodeAddSlave(master,sender); - sender->slaveof = master; - - /* Update config. */ - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); - } - } - } - - /* Update our info about served slots. - * - * Note: this MUST happen after we update the master/slave state - * so that CLUSTER_NODE_MASTER flag will be set. */ - - /* Many checks are only needed if the set of served slots this - * instance claims is different compared to the set of slots we have - * for it. Check this ASAP to avoid other computational expansive - * checks later. */ - clusterNode *sender_master = NULL; /* Sender or its master if slave. */ - int dirty_slots = 0; /* Sender claimed slots don't match my view? */ - - if (sender) { - sender_master = nodeIsMaster(sender) ? sender : sender->slaveof; - if (sender_master) { - dirty_slots = memcmp(sender_master->slots, - hdr->myslots,sizeof(hdr->myslots)) != 0; - } - } - - /* 1) If the sender of the message is a master, and we detected that - * the set of slots it claims changed, scan the slots to see if we - * need to update our configuration. 
*/ - if (sender && nodeIsMaster(sender) && dirty_slots) - clusterUpdateSlotsConfigWith(sender,senderConfigEpoch,hdr->myslots); - - /* 2) We also check for the reverse condition, that is, the sender - * claims to serve slots we know are served by a master with a - * greater configEpoch. If this happens we inform the sender. - * - * This is useful because sometimes after a partition heals, a - * reappearing master may be the last one to claim a given set of - * hash slots, but with a configuration that other instances know to - * be deprecated. Example: - * - * A and B are master and slave for slots 1,2,3. - * A is partitioned away, B gets promoted. - * B is partitioned away, and A returns available. - * - * Usually B would PING A publishing its set of served slots and its - * configEpoch, but because of the partition B can't inform A of the - * new configuration, so other nodes that have an updated table must - * do it. In this way A will stop to act as a master (or can try to - * failover if there are the conditions to win the election). */ - if (sender && dirty_slots) { - int j; - - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (bitmapTestBit(hdr->myslots,j)) { - if (server.cluster->slots[j] == sender || - isSlotUnclaimed(j)) continue; - if (server.cluster->slots[j]->configEpoch > - senderConfigEpoch) - { - serverLog(LL_VERBOSE, - "Node %.40s has old slots configuration, sending " - "an UPDATE message about %.40s", - sender->name, server.cluster->slots[j]->name); - clusterSendUpdate(sender->link, - server.cluster->slots[j]); - - /* TODO: instead of exiting the loop send every other - * UPDATE packet for other nodes that are the new owner - * of sender's slots. */ - break; - } - } - } - } - - /* If our config epoch collides with the sender's try to fix - * the problem. 
*/ - if (sender && - nodeIsMaster(myself) && nodeIsMaster(sender) && - senderConfigEpoch == myself->configEpoch) - { - clusterHandleConfigEpochCollision(sender); - } - - /* Get info from the gossip section */ - if (sender) { - clusterProcessGossipSection(hdr,link); - clusterProcessPingExtensions(hdr,link); - } - } else if (type == CLUSTERMSG_TYPE_FAIL) { - clusterNode *failing; - - if (sender) { - failing = clusterLookupNode(hdr->data.fail.about.nodename, CLUSTER_NAMELEN); - if (failing && - !(failing->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_MYSELF))) - { - serverLog(LL_NOTICE, - "FAIL message received from %.40s (%s) about %.40s (%s)", - hdr->sender, sender->human_nodename, hdr->data.fail.about.nodename, failing->human_nodename); - failing->flags |= CLUSTER_NODE_FAIL; - failing->fail_time = now; - failing->flags &= ~CLUSTER_NODE_PFAIL; - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE); - } - } else { - serverLog(LL_NOTICE, - "Ignoring FAIL message from unknown node %.40s about %.40s", - hdr->sender, hdr->data.fail.about.nodename); - } - } else if (type == CLUSTERMSG_TYPE_PUBLISH || type == CLUSTERMSG_TYPE_PUBLISHSHARD) { - if (!sender) return 1; /* We don't know that node. */ - - robj *channel, *message; - uint32_t channel_len, message_len; - - /* Don't bother creating useless objects if there are no - * Pub/Sub subscribers. 
*/ - if ((type == CLUSTERMSG_TYPE_PUBLISH - && serverPubsubSubscriptionCount() > 0) - || (type == CLUSTERMSG_TYPE_PUBLISHSHARD - && serverPubsubShardSubscriptionCount() > 0)) - { - channel_len = ntohl(hdr->data.publish.msg.channel_len); - message_len = ntohl(hdr->data.publish.msg.message_len); - channel = createStringObject( - (char*)hdr->data.publish.msg.bulk_data,channel_len); - message = createStringObject( - (char*)hdr->data.publish.msg.bulk_data+channel_len, - message_len); - pubsubPublishMessage(channel, message, type == CLUSTERMSG_TYPE_PUBLISHSHARD); - decrRefCount(channel); - decrRefCount(message); - } - } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST) { - if (!sender) return 1; /* We don't know that node. */ - clusterSendFailoverAuthIfNeeded(sender,hdr); - } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK) { - if (!sender) return 1; /* We don't know that node. */ - /* We consider this vote only if the sender is a master serving - * a non zero number of slots, and its currentEpoch is greater or - * equal to epoch where this node started the election. */ - if (nodeIsMaster(sender) && sender->numslots > 0 && - senderCurrentEpoch >= server.cluster->failover_auth_epoch) - { - server.cluster->failover_auth_count++; - /* Maybe we reached a quorum here, set a flag to make sure - * we check ASAP. */ - clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); - } - } else if (type == CLUSTERMSG_TYPE_MFSTART) { - /* This message is acceptable only if I'm a master and the sender - * is one of my slaves. */ - if (!sender || sender->slaveof != myself) return 1; - /* Manual failover requested from slaves. Initialize the state - * accordingly. 
*/ - resetManualFailover(); - server.cluster->mf_end = now + CLUSTER_MF_TIMEOUT; - server.cluster->mf_slave = sender; - pauseActions(PAUSE_DURING_FAILOVER, - now + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT), - PAUSE_ACTIONS_CLIENT_WRITE_SET); - serverLog(LL_NOTICE,"Manual failover requested by replica %.40s (%s).", - sender->name, sender->human_nodename); - /* We need to send a ping message to the replica, as it would carry - * `server.cluster->mf_master_offset`, which means the master paused clients - * at offset `server.cluster->mf_master_offset`, so that the replica would - * know that it is safe to set its `server.cluster->mf_can_start` to 1 so as - * to complete failover as quickly as possible. */ - clusterSendPing(link, CLUSTERMSG_TYPE_PING); - } else if (type == CLUSTERMSG_TYPE_UPDATE) { - clusterNode *n; /* The node the update is about. */ - uint64_t reportedConfigEpoch = - ntohu64(hdr->data.update.nodecfg.configEpoch); - - if (!sender) return 1; /* We don't know the sender. */ - n = clusterLookupNode(hdr->data.update.nodecfg.nodename, CLUSTER_NAMELEN); - if (!n) return 1; /* We don't know the reported node. */ - if (n->configEpoch >= reportedConfigEpoch) return 1; /* Nothing new. */ - - /* If in our current config the node is a slave, set it as a master. */ - if (nodeIsSlave(n)) clusterSetNodeAsMaster(n); - - /* Update the node's configEpoch. */ - n->configEpoch = reportedConfigEpoch; - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_FSYNC_CONFIG); - - /* Check the bitmap of served slots and update our - * config accordingly. */ - clusterUpdateSlotsConfigWith(n,reportedConfigEpoch, - hdr->data.update.nodecfg.slots); - } else if (type == CLUSTERMSG_TYPE_MODULE) { - if (!sender) return 1; /* Protect the module from unknown nodes. */ - /* We need to route this message back to the right module subscribed - * for the right message type. 
*/ - uint64_t module_id = hdr->data.module.msg.module_id; /* Endian-safe ID */ - uint32_t len = ntohl(hdr->data.module.msg.len); - uint8_t type = hdr->data.module.msg.type; - unsigned char *payload = hdr->data.module.msg.bulk_data; - moduleCallClusterReceivers(sender->name,module_id,type,payload,len); - } else { - serverLog(LL_WARNING,"Received unknown packet type: %d", type); - } - return 1; -} - -/* This function is called when we detect the link with this node is lost. - We set the node as no longer connected. The Cluster Cron will detect - this connection and will try to get it connected again. - - Instead if the node is a temporary node used to accept a query, we - completely free the node on error. */ -void handleLinkIOError(clusterLink *link) { - freeClusterLink(link); -} - -/* Send the messages queued for the link. */ -void clusterWriteHandler(connection *conn) { - clusterLink *link = connGetPrivateData(conn); - ssize_t nwritten; - size_t totwritten = 0; - - while (totwritten < NET_MAX_WRITES_PER_EVENT && listLength(link->send_msg_queue) > 0) { - listNode *head = listFirst(link->send_msg_queue); - clusterMsgSendBlock *msgblock = (clusterMsgSendBlock*)head->value; - clusterMsg *msg = &msgblock->msg; - size_t msg_offset = link->head_msg_send_offset; - size_t msg_len = ntohl(msg->totlen); - - nwritten = connWrite(conn, (char*)msg + msg_offset, msg_len - msg_offset); - if (nwritten <= 0) { - serverLog(LL_DEBUG,"I/O error writing to node link: %s", - (nwritten == -1) ? 
connGetLastError(conn) : "short write"); - handleLinkIOError(link); - return; - } - if (msg_offset + nwritten < msg_len) { - /* If full message wasn't written, record the offset - * and continue sending from this point next time */ - link->head_msg_send_offset += nwritten; - return; - } - serverAssert((msg_offset + nwritten) == msg_len); - link->head_msg_send_offset = 0; - - /* Delete the node and update our memory tracking */ - uint32_t blocklen = msgblock->totlen; - listDelNode(link->send_msg_queue, head); - server.stat_cluster_links_memory -= sizeof(listNode); - link->send_msg_queue_mem -= sizeof(listNode) + blocklen; - - totwritten += nwritten; - } - - if (listLength(link->send_msg_queue) == 0) - connSetWriteHandler(link->conn, NULL); -} - -/* A connect handler that gets called when a connection to another node - * gets established. - */ -void clusterLinkConnectHandler(connection *conn) { - clusterLink *link = connGetPrivateData(conn); - clusterNode *node = link->node; - - /* Check if connection succeeded */ - if (connGetState(conn) != CONN_STATE_CONNECTED) { - serverLog(LL_VERBOSE, "Connection with Node %.40s at %s:%d failed: %s", - node->name, node->ip, node->cport, - connGetLastError(conn)); - freeClusterLink(link); - return; - } - - /* Register a read handler from now on */ - connSetReadHandler(conn, clusterReadHandler); - - /* Queue a PING in the new connection ASAP: this is crucial - * to avoid false positives in failure detection. - * - * If the node is flagged as MEET, we send a MEET message instead - * of a PING one, to force the receiver to add us in its node - * table. */ - mstime_t old_ping_sent = node->ping_sent; - clusterSendPing(link, node->flags & CLUSTER_NODE_MEET ? - CLUSTERMSG_TYPE_MEET : CLUSTERMSG_TYPE_PING); - if (old_ping_sent) { - /* If there was an active ping before the link was - * disconnected, we want to restore the ping time, otherwise - * replaced by the clusterSendPing() call. 
*/ - node->ping_sent = old_ping_sent; - } - /* We can clear the flag after the first packet is sent. - * If we'll never receive a PONG, we'll never send new packets - * to this node. Instead after the PONG is received and we - * are no longer in meet/handshake status, we want to send - * normal PING packets. */ - node->flags &= ~CLUSTER_NODE_MEET; - - serverLog(LL_DEBUG,"Connecting with Node %.40s at %s:%d", - node->name, node->ip, node->cport); -} - -/* Read data. Try to read the first field of the header first to check the - * full length of the packet. When a whole packet is in memory this function - * will call the function to process the packet. And so forth. */ -void clusterReadHandler(connection *conn) { - clusterMsg buf[1]; - ssize_t nread; - clusterMsg *hdr; - clusterLink *link = connGetPrivateData(conn); - unsigned int readlen, rcvbuflen; - - while(1) { /* Read as long as there is data to read. */ - rcvbuflen = link->rcvbuf_len; - if (rcvbuflen < 8) { - /* First, obtain the first 8 bytes to get the full message - * length. */ - readlen = 8 - rcvbuflen; - } else { - /* Finally read the full message. */ - hdr = (clusterMsg*) link->rcvbuf; - if (rcvbuflen == 8) { - /* Perform some sanity check on the message signature - * and length. */ - if (memcmp(hdr->sig,"RCmb",4) != 0 || - ntohl(hdr->totlen) < CLUSTERMSG_MIN_LEN) - { - char ip[NET_IP_STR_LEN]; - int port; - if (connAddrPeerName(conn, ip, sizeof(ip), &port) == -1) { - serverLog(LL_WARNING, - "Bad message length or signature received " - "on the Cluster bus."); - } else { - serverLog(LL_WARNING, - "Bad message length or signature received " - "on the Cluster bus from %s:%d", ip, port); - } - handleLinkIOError(link); - return; - } - } - readlen = ntohl(hdr->totlen) - rcvbuflen; - if (readlen > sizeof(buf)) readlen = sizeof(buf); - } - - nread = connRead(conn,buf,readlen); - if (nread == -1 && (connGetState(conn) == CONN_STATE_CONNECTED)) return; /* No more data ready. 
*/ - - if (nread <= 0) { - /* I/O error... */ - serverLog(LL_DEBUG,"I/O error reading from node link: %s", - (nread == 0) ? "connection closed" : connGetLastError(conn)); - handleLinkIOError(link); - return; - } else { - /* Read data and recast the pointer to the new buffer. */ - size_t unused = link->rcvbuf_alloc - link->rcvbuf_len; - if ((size_t)nread > unused) { - size_t required = link->rcvbuf_len + nread; - size_t prev_rcvbuf_alloc = link->rcvbuf_alloc; - /* If less than 1mb, grow to twice the needed size, if larger grow by 1mb. */ - link->rcvbuf_alloc = required < RCVBUF_MAX_PREALLOC ? required * 2: required + RCVBUF_MAX_PREALLOC; - link->rcvbuf = zrealloc(link->rcvbuf, link->rcvbuf_alloc); - server.stat_cluster_links_memory += link->rcvbuf_alloc - prev_rcvbuf_alloc; - } - memcpy(link->rcvbuf + link->rcvbuf_len, buf, nread); - link->rcvbuf_len += nread; - hdr = (clusterMsg*) link->rcvbuf; - rcvbuflen += nread; - } - - /* Total length obtained? Process this packet. */ - if (rcvbuflen >= 8 && rcvbuflen == ntohl(hdr->totlen)) { - if (clusterProcessPacket(link)) { - if (link->rcvbuf_alloc > RCVBUF_INIT_LEN) { - size_t prev_rcvbuf_alloc = link->rcvbuf_alloc; - zfree(link->rcvbuf); - link->rcvbuf = zmalloc(link->rcvbuf_alloc = RCVBUF_INIT_LEN); - server.stat_cluster_links_memory += link->rcvbuf_alloc - prev_rcvbuf_alloc; - } - link->rcvbuf_len = 0; - } else { - return; /* Link no longer valid. */ - } - } - } -} - -/* Put the message block into the link's send queue. - * - * It is guaranteed that this function will never have as a side effect - * the link to be invalidated, so it is safe to call this function - * from event handlers that will do stuff with the same link later. 
*/ -void clusterSendMessage(clusterLink *link, clusterMsgSendBlock *msgblock) { - if (!link) { - return; - } - if (listLength(link->send_msg_queue) == 0 && msgblock->msg.totlen != 0) - connSetWriteHandlerWithBarrier(link->conn, clusterWriteHandler, 1); - - listAddNodeTail(link->send_msg_queue, msgblock); - msgblock->refcount++; - - /* Update memory tracking */ - link->send_msg_queue_mem += sizeof(listNode) + msgblock->totlen; - server.stat_cluster_links_memory += sizeof(listNode); - - /* Populate sent messages stats. */ - uint16_t type = ntohs(msgblock->msg.type); - if (type < CLUSTERMSG_TYPE_COUNT) - server.cluster->stats_bus_messages_sent[type]++; -} - -/* Send a message to all the nodes that are part of the cluster having - * a connected link. - * - * It is guaranteed that this function will never have as a side effect - * some node->link to be invalidated, so it is safe to call this function - * from event handlers that will do stuff with node links later. */ -void clusterBroadcastMessage(clusterMsgSendBlock *msgblock) { - dictIterator *di; - dictEntry *de; - - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - - if (node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE)) - continue; - clusterSendMessage(node->link,msgblock); - } - dictReleaseIterator(di); -} - -/* Build the message header. hdr must point to a buffer at least - * sizeof(clusterMsg) in bytes. */ -static void clusterBuildMessageHdr(clusterMsg *hdr, int type, size_t msglen) { - uint64_t offset; - clusterNode *master; - - /* If this node is a master, we send its slots bitmap and configEpoch. - * If this node is a slave we send the master's information instead (the - * node is flagged as slave so the receiver knows that it is NOT really - * in charge for this slots. */ - master = (nodeIsSlave(myself) && myself->slaveof) ? 
- myself->slaveof : myself; - - hdr->ver = htons(CLUSTER_PROTO_VER); - hdr->sig[0] = 'R'; - hdr->sig[1] = 'C'; - hdr->sig[2] = 'm'; - hdr->sig[3] = 'b'; - hdr->type = htons(type); - memcpy(hdr->sender,myself->name,CLUSTER_NAMELEN); - - /* If cluster-announce-ip option is enabled, force the receivers of our - * packets to use the specified address for this node. Otherwise if the - * first byte is zero, they'll do auto discovery. */ - memset(hdr->myip,0,NET_IP_STR_LEN); - if (server.cluster_announce_ip) { - redis_strlcpy(hdr->myip,server.cluster_announce_ip,NET_IP_STR_LEN); - } - - /* Handle cluster-announce-[tls-|bus-]port. */ - int announced_tcp_port, announced_tls_port, announced_cport; - deriveAnnouncedPorts(&announced_tcp_port, &announced_tls_port, &announced_cport); - - memcpy(hdr->myslots,master->slots,sizeof(hdr->myslots)); - memset(hdr->slaveof,0,CLUSTER_NAMELEN); - if (myself->slaveof != NULL) - memcpy(hdr->slaveof,myself->slaveof->name, CLUSTER_NAMELEN); - if (server.tls_cluster) { - hdr->port = htons(announced_tls_port); - hdr->pport = htons(announced_tcp_port); - } else { - hdr->port = htons(announced_tcp_port); - hdr->pport = htons(announced_tls_port); - } - hdr->cport = htons(announced_cport); - hdr->flags = htons(myself->flags); - hdr->state = server.cluster->state; - - /* Set the currentEpoch and configEpochs. */ - hdr->currentEpoch = htonu64(server.cluster->currentEpoch); - hdr->configEpoch = htonu64(master->configEpoch); - - /* Set the replication offset. */ - if (nodeIsSlave(myself)) - offset = replicationGetSlaveOffset(); - else - offset = server.master_repl_offset; - hdr->offset = htonu64(offset); - - /* Set the message flags. */ - if (nodeIsMaster(myself) && server.cluster->mf_end) - hdr->mflags[0] |= CLUSTERMSG_FLAG0_PAUSED; - - hdr->totlen = htonl(msglen); -} - -/* Set the i-th entry of the gossip section in the message pointed by 'hdr' - * to the info of the specified node 'n'. 
*/ -void clusterSetGossipEntry(clusterMsg *hdr, int i, clusterNode *n) { - clusterMsgDataGossip *gossip; - gossip = &(hdr->data.ping.gossip[i]); - memcpy(gossip->nodename,n->name,CLUSTER_NAMELEN); - gossip->ping_sent = htonl(n->ping_sent/1000); - gossip->pong_received = htonl(n->pong_received/1000); - memcpy(gossip->ip,n->ip,sizeof(n->ip)); - if (server.tls_cluster) { - gossip->port = htons(n->tls_port); - gossip->pport = htons(n->tcp_port); - } else { - gossip->port = htons(n->tcp_port); - gossip->pport = htons(n->tls_port); - } - gossip->cport = htons(n->cport); - gossip->flags = htons(n->flags); - gossip->notused1 = 0; -} - -/* Send a PING or PONG packet to the specified node, making sure to add enough - * gossip information. */ -void clusterSendPing(clusterLink *link, int type) { - static unsigned long long cluster_pings_sent = 0; - cluster_pings_sent++; - int gossipcount = 0; /* Number of gossip sections added so far. */ - int wanted; /* Number of gossip sections we want to append if possible. */ - int estlen; /* Upper bound on estimated packet length */ - /* freshnodes is the max number of nodes we can hope to append at all: - * nodes available minus two (ourself and the node we are sending the - * message to). However practically there may be less valid nodes since - * nodes in handshake state, disconnected, are not considered. */ - int freshnodes = dictSize(server.cluster->nodes)-2; - - /* How many gossip sections we want to add? 1/10 of the number of nodes - * and anyway at least 3. Why 1/10? - * - * If we have N masters, with N/10 entries, and we consider that in - * node_timeout we exchange with each other node at least 4 packets - * (we ping in the worst case in node_timeout/2 time, and we also - * receive two pings from the host), we have a total of 8 packets - * in the node_timeout*2 failure reports validity time. 
So we have - * that, for a single PFAIL node, we can expect to receive the following - * number of failure reports (in the specified window of time): - * - * PROB * GOSSIP_ENTRIES_PER_PACKET * TOTAL_PACKETS: - * - * PROB = probability of being featured in a single gossip entry, - * which is 1 / NUM_OF_NODES. - * ENTRIES = 10. - * TOTAL_PACKETS = 2 * 4 * NUM_OF_MASTERS. - * - * If we assume we have just masters (so num of nodes and num of masters - * is the same), with 1/10 we always get over the majority, and specifically - * 80% of the number of nodes, to account for many masters failing at the - * same time. - * - * Since we have non-voting slaves that lower the probability of an entry - * to feature our node, we set the number of entries per packet as - * 10% of the total nodes we have. */ - wanted = floor(dictSize(server.cluster->nodes)/10); - if (wanted < 3) wanted = 3; - if (wanted > freshnodes) wanted = freshnodes; - - /* Include all the nodes in PFAIL state, so that failure reports are - * faster to propagate to go from PFAIL to FAIL state. */ - int pfail_wanted = server.cluster->stats_pfail_nodes; - - /* Compute the maximum estlen to allocate our buffer. We'll fix the estlen - * later according to the number of gossip sections we really were able - * to put inside the packet. */ - estlen = sizeof(clusterMsg) - sizeof(union clusterMsgData); - estlen += (sizeof(clusterMsgDataGossip)*(wanted + pfail_wanted)); - estlen += writePingExt(NULL, 0); - /* Note: clusterBuildMessageHdr() expects the buffer to be always at least - * sizeof(clusterMsg) or more. 
*/ - if (estlen < (int)sizeof(clusterMsg)) estlen = sizeof(clusterMsg); - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, estlen); - clusterMsg *hdr = &msgblock->msg; - - if (!link->inbound && type == CLUSTERMSG_TYPE_PING) - link->node->ping_sent = mstime(); - - /* Populate the gossip fields */ - int maxiterations = wanted*3; - while(freshnodes > 0 && gossipcount < wanted && maxiterations--) { - dictEntry *de = dictGetRandomKey(server.cluster->nodes); - clusterNode *this = dictGetVal(de); - - /* Don't include this node: the whole packet header is about us - * already, so we just gossip about other nodes. */ - if (this == myself) continue; - - /* PFAIL nodes will be added later. */ - if (this->flags & CLUSTER_NODE_PFAIL) continue; - - /* In the gossip section don't include: - * 1) Nodes in HANDSHAKE state. - * 3) Nodes with the NOADDR flag set. - * 4) Disconnected nodes if they don't have configured slots. - */ - if (this->flags & (CLUSTER_NODE_HANDSHAKE|CLUSTER_NODE_NOADDR) || - (this->link == NULL && this->numslots == 0)) - { - freshnodes--; /* Technically not correct, but saves CPU. */ - continue; - } - - /* Do not add a node we already have. */ - if (this->last_in_ping_gossip == cluster_pings_sent) continue; - - /* Add it */ - clusterSetGossipEntry(hdr,gossipcount,this); - this->last_in_ping_gossip = cluster_pings_sent; - freshnodes--; - gossipcount++; - } - - /* If there are PFAIL nodes, add them at the end. 
*/ - if (pfail_wanted) { - dictIterator *di; - dictEntry *de; - - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL && pfail_wanted > 0) { - clusterNode *node = dictGetVal(de); - if (node->flags & CLUSTER_NODE_HANDSHAKE) continue; - if (node->flags & CLUSTER_NODE_NOADDR) continue; - if (!(node->flags & CLUSTER_NODE_PFAIL)) continue; - clusterSetGossipEntry(hdr,gossipcount,node); - gossipcount++; - /* We take the count of the slots we allocated, since the - * PFAIL stats may not match perfectly with the current number - * of PFAIL nodes. */ - pfail_wanted--; - } - dictReleaseIterator(di); - } - - /* Compute the actual total length and send! */ - uint32_t totlen = 0; - totlen += writePingExt(hdr, gossipcount); - totlen += sizeof(clusterMsg)-sizeof(union clusterMsgData); - totlen += (sizeof(clusterMsgDataGossip)*gossipcount); - serverAssert(gossipcount < USHRT_MAX); - hdr->count = htons(gossipcount); - hdr->totlen = htonl(totlen); - - clusterSendMessage(link,msgblock); - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* Send a PONG packet to every connected node that's not in handshake state - * and for which we have a valid link. - * - * In Redis Cluster pongs are not used just for failure detection, but also - * to carry important configuration information. So broadcasting a pong is - * useful when something changes in the configuration and we want to make - * the cluster aware ASAP (for instance after a slave promotion). - * - * The 'target' argument specifies the receiving instances using the - * defines below: - * - * CLUSTER_BROADCAST_ALL -> All known instances. - * CLUSTER_BROADCAST_LOCAL_SLAVES -> All slaves in my master-slaves ring. 
- */ -#define CLUSTER_BROADCAST_ALL 0 -#define CLUSTER_BROADCAST_LOCAL_SLAVES 1 -void clusterBroadcastPong(int target) { - dictIterator *di; - dictEntry *de; - - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - - if (!node->link) continue; - if (node == myself || nodeInHandshake(node)) continue; - if (target == CLUSTER_BROADCAST_LOCAL_SLAVES) { - int local_slave = - nodeIsSlave(node) && node->slaveof && - (node->slaveof == myself || node->slaveof == myself->slaveof); - if (!local_slave) continue; - } - clusterSendPing(node->link,CLUSTERMSG_TYPE_PONG); - } - dictReleaseIterator(di); -} - -/* Create a PUBLISH message block. - * - * Sanitizer suppression: In clusterMsgDataPublish, sizeof(bulk_data) is 8. - * As all the struct is used as a buffer, when more than 8 bytes are copied into - * the 'bulk_data', sanitizer generates an out-of-bounds error which is a false - * positive in this context. */ -REDIS_NO_SANITIZE("bounds") -clusterMsgSendBlock *clusterCreatePublishMsgBlock(robj *channel, robj *message, uint16_t type) { - - uint32_t channel_len, message_len; - - channel = getDecodedObject(channel); - message = getDecodedObject(message); - channel_len = sdslen(channel->ptr); - message_len = sdslen(message->ptr); - - size_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - msglen += sizeof(clusterMsgDataPublish) - 8 + channel_len + message_len; - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, msglen); - - clusterMsg *hdr = &msgblock->msg; - hdr->data.publish.msg.channel_len = htonl(channel_len); - hdr->data.publish.msg.message_len = htonl(message_len); - memcpy(hdr->data.publish.msg.bulk_data,channel->ptr,sdslen(channel->ptr)); - memcpy(hdr->data.publish.msg.bulk_data+sdslen(channel->ptr), - message->ptr,sdslen(message->ptr)); - - decrRefCount(channel); - decrRefCount(message); - - return msgblock; -} - -/* Send a FAIL message to all the nodes we are able to 
contact. - * The FAIL message is sent when we detect that a node is failing - * (CLUSTER_NODE_PFAIL) and we also receive a gossip confirmation of this: - * we switch the node state to CLUSTER_NODE_FAIL and ask all the other - * nodes to do the same ASAP. */ -void clusterSendFail(char *nodename) { - uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) - + sizeof(clusterMsgDataFail); - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAIL, msglen); - - clusterMsg *hdr = &msgblock->msg; - memcpy(hdr->data.fail.about.nodename,nodename,CLUSTER_NAMELEN); - - clusterBroadcastMessage(msgblock); - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* Send an UPDATE message to the specified link carrying the specified 'node' - * slots configuration. The node name, slots bitmap, and configEpoch info - * are included. */ -void clusterSendUpdate(clusterLink *link, clusterNode *node) { - if (link == NULL) return; - - uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) - + sizeof(clusterMsgDataUpdate); - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_UPDATE, msglen); - - clusterMsg *hdr = &msgblock->msg; - memcpy(hdr->data.update.nodecfg.nodename,node->name,CLUSTER_NAMELEN); - hdr->data.update.nodecfg.configEpoch = htonu64(node->configEpoch); - memcpy(hdr->data.update.nodecfg.slots,node->slots,sizeof(node->slots)); - for (unsigned int i = 0; i < sizeof(node->slots); i++) { - /* Don't advertise slots that the node stopped claiming */ - hdr->data.update.nodecfg.slots[i] = hdr->data.update.nodecfg.slots[i] & (~server.cluster->owner_not_claiming_slot[i]); - } - - clusterSendMessage(link,msgblock); - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* Send a MODULE message. - * - * If link is NULL, then the message is broadcasted to the whole cluster. 
*/ -void clusterSendModule(clusterLink *link, uint64_t module_id, uint8_t type, - const char *payload, uint32_t len) { - uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - msglen += sizeof(clusterMsgModule) - 3 + len; - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MODULE, msglen); - - clusterMsg *hdr = &msgblock->msg; - hdr->data.module.msg.module_id = module_id; /* Already endian adjusted. */ - hdr->data.module.msg.type = type; - hdr->data.module.msg.len = htonl(len); - memcpy(hdr->data.module.msg.bulk_data,payload,len); - - if (link) - clusterSendMessage(link,msgblock); - else - clusterBroadcastMessage(msgblock); - - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* This function gets a cluster node ID string as target, the same way the nodes - * addresses are represented in the modules side, resolves the node, and sends - * the message. If the target is NULL the message is broadcasted. - * - * The function returns C_OK if the target is valid, otherwise C_ERR is - * returned. */ -int clusterSendModuleMessageToTarget(const char *target, uint64_t module_id, uint8_t type, const char *payload, uint32_t len) { - clusterNode *node = NULL; - - if (target != NULL) { - node = clusterLookupNode(target, strlen(target)); - if (node == NULL || node->link == NULL) return C_ERR; - } - - clusterSendModule(target ? node->link : NULL, - module_id, type, payload, len); - return C_OK; -} - -/* ----------------------------------------------------------------------------- - * CLUSTER Pub/Sub support - * - * If `sharded` is 0: - * For now we do very little, just propagating [S]PUBLISH messages across the whole - * cluster. In the future we'll try to get smarter and avoiding propagating those - * messages to hosts without receives for a given channel. - * Otherwise: - * Publish this message across the slot (primary/replica). 
- * -------------------------------------------------------------------------- */ -void clusterPropagatePublish(robj *channel, robj *message, int sharded) { - clusterMsgSendBlock *msgblock; - - if (!sharded) { - msgblock = clusterCreatePublishMsgBlock(channel, message, CLUSTERMSG_TYPE_PUBLISH); - clusterBroadcastMessage(msgblock); - clusterMsgSendBlockDecrRefCount(msgblock); - return; - } - - listIter li; - listNode *ln; - list *nodes_for_slot = clusterGetNodesInMyShard(server.cluster->myself); - serverAssert(nodes_for_slot != NULL); - listRewind(nodes_for_slot, &li); - msgblock = clusterCreatePublishMsgBlock(channel, message, CLUSTERMSG_TYPE_PUBLISHSHARD); - while((ln = listNext(&li))) { - clusterNode *node = listNodeValue(ln); - if (node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE)) - continue; - clusterSendMessage(node->link,msgblock); - } - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* ----------------------------------------------------------------------------- - * SLAVE node specific functions - * -------------------------------------------------------------------------- */ - -/* This function sends a FAILOVER_AUTH_REQUEST message to every node in order to - * see if there is the quorum for this slave instance to failover its failing - * master. - * - * Note that we send the failover request to everybody, master and slave nodes, - * but only the masters are supposed to reply to our query. */ -void clusterRequestFailoverAuth(void) { - uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST, msglen); - - clusterMsg *hdr = &msgblock->msg; - /* If this is a manual failover, set the CLUSTERMSG_FLAG0_FORCEACK bit - * in the header to communicate the nodes receiving the message that - * they should authorized the failover even if the master is working. 
*/ - if (server.cluster->mf_end) hdr->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK; - clusterBroadcastMessage(msgblock); - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* Send a FAILOVER_AUTH_ACK message to the specified node. */ -void clusterSendFailoverAuth(clusterNode *node) { - if (!node->link) return; - - uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK, msglen); - - clusterSendMessage(node->link,msgblock); - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* Send a MFSTART message to the specified node. */ -void clusterSendMFStart(clusterNode *node) { - if (!node->link) return; - - uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MFSTART, msglen); - - clusterSendMessage(node->link,msgblock); - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* Vote for the node asking for our vote if there are the conditions. */ -void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { - clusterNode *master = node->slaveof; - uint64_t requestCurrentEpoch = ntohu64(request->currentEpoch); - uint64_t requestConfigEpoch = ntohu64(request->configEpoch); - unsigned char *claimed_slots = request->myslots; - int force_ack = request->mflags[0] & CLUSTERMSG_FLAG0_FORCEACK; - int j; - - /* IF we are not a master serving at least 1 slot, we don't have the - * right to vote, as the cluster size in Redis Cluster is the number - * of masters serving at least one slot, and quorum is the cluster - * size + 1 */ - if (nodeIsSlave(myself) || myself->numslots == 0) return; - - /* Request epoch must be >= our currentEpoch. - * Note that it is impossible for it to actually be greater since - * our currentEpoch was updated as a side effect of receiving this - * request, if the request epoch was greater. 
*/ - if (requestCurrentEpoch < server.cluster->currentEpoch) { - serverLog(LL_WARNING, - "Failover auth denied to %.40s (%s): reqEpoch (%llu) < curEpoch(%llu)", - node->name, node->human_nodename, - (unsigned long long) requestCurrentEpoch, - (unsigned long long) server.cluster->currentEpoch); - return; - } - - /* I already voted for this epoch? Return ASAP. */ - if (server.cluster->lastVoteEpoch == server.cluster->currentEpoch) { - serverLog(LL_WARNING, - "Failover auth denied to %.40s (%s): already voted for epoch %llu", - node->name, node->human_nodename, - (unsigned long long) server.cluster->currentEpoch); - return; - } - - /* Node must be a slave and its master down. - * The master can be non failing if the request is flagged - * with CLUSTERMSG_FLAG0_FORCEACK (manual failover). */ - if (nodeIsMaster(node) || master == NULL || - (!nodeFailed(master) && !force_ack)) - { - if (nodeIsMaster(node)) { - serverLog(LL_WARNING, - "Failover auth denied to %.40s (%s): it is a master node", - node->name, node->human_nodename); - } else if (master == NULL) { - serverLog(LL_WARNING, - "Failover auth denied to %.40s (%s): I don't know its master", - node->name, node->human_nodename); - } else if (!nodeFailed(master)) { - serverLog(LL_WARNING, - "Failover auth denied to %.40s (%s): its master is up", - node->name, node->human_nodename); - } - return; - } - - /* We did not voted for a slave about this master for two - * times the node timeout. This is not strictly needed for correctness - * of the algorithm but makes the base case more linear. 
*/ - if (mstime() - node->slaveof->voted_time < server.cluster_node_timeout * 2) - { - serverLog(LL_WARNING, - "Failover auth denied to %.40s %s: " - "can't vote about this master before %lld milliseconds", - node->name, node->human_nodename, - (long long) ((server.cluster_node_timeout*2)- - (mstime() - node->slaveof->voted_time))); - return; - } - - /* The slave requesting the vote must have a configEpoch for the claimed - * slots that is >= the one of the masters currently serving the same - * slots in the current configuration. */ - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (bitmapTestBit(claimed_slots, j) == 0) continue; - if (isSlotUnclaimed(j) || - server.cluster->slots[j]->configEpoch <= requestConfigEpoch) - { - continue; - } - /* If we reached this point we found a slot that in our current slots - * is served by a master with a greater configEpoch than the one claimed - * by the slave requesting our vote. Refuse to vote for this slave. */ - serverLog(LL_WARNING, - "Failover auth denied to %.40s (%s): " - "slot %d epoch (%llu) > reqEpoch (%llu)", - node->name, node->human_nodename, j, - (unsigned long long) server.cluster->slots[j]->configEpoch, - (unsigned long long) requestConfigEpoch); - return; - } - - /* We can vote for this slave. */ - server.cluster->lastVoteEpoch = server.cluster->currentEpoch; - node->slaveof->voted_time = mstime(); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_FSYNC_CONFIG); - clusterSendFailoverAuth(node); - serverLog(LL_NOTICE, "Failover auth granted to %.40s (%s) for epoch %llu", - node->name, node->human_nodename, (unsigned long long) server.cluster->currentEpoch); -} - -/* This function returns the "rank" of this instance, a slave, in the context - * of its master-slaves ring. The rank of the slave is given by the number of - * other slaves for the same master that have a better replication offset - * compared to the local one (better means, greater, so they claim more data). 
- * - * A slave with rank 0 is the one with the greatest (most up to date) - * replication offset, and so forth. Note that because how the rank is computed - * multiple slaves may have the same rank, in case they have the same offset. - * - * The slave rank is used to add a delay to start an election in order to - * get voted and replace a failing master. Slaves with better replication - * offsets are more likely to win. */ -int clusterGetSlaveRank(void) { - long long myoffset; - int j, rank = 0; - clusterNode *master; - - serverAssert(nodeIsSlave(myself)); - master = myself->slaveof; - if (master == NULL) return 0; /* Never called by slaves without master. */ - - myoffset = replicationGetSlaveOffset(); - for (j = 0; j < master->numslaves; j++) - if (master->slaves[j] != myself && - !nodeCantFailover(master->slaves[j]) && - master->slaves[j]->repl_offset > myoffset) rank++; - return rank; -} - -/* This function is called by clusterHandleSlaveFailover() in order to - * let the slave log why it is not able to failover. Sometimes there are - * not the conditions, but since the failover function is called again and - * again, we can't log the same things continuously. - * - * This function works by logging only if a given set of conditions are - * true: - * - * 1) The reason for which the failover can't be initiated changed. - * The reasons also include a NONE reason we reset the state to - * when the slave finds that its master is fine (no FAIL flag). - * 2) Also, the log is emitted again if the master is still down and - * the reason for not failing over is still the same, but more than - * CLUSTER_CANT_FAILOVER_RELOG_PERIOD seconds elapsed. - * 3) Finally, the function only logs if the slave is down for more than - * five seconds + NODE_TIMEOUT. This way nothing is logged when a - * failover starts in a reasonable time. - * - * The function is called with the reason why the slave can't failover - * which is one of the integer macros CLUSTER_CANT_FAILOVER_*. 
- * - * The function is guaranteed to be called only if 'myself' is a slave. */ -void clusterLogCantFailover(int reason) { - char *msg; - static time_t lastlog_time = 0; - mstime_t nolog_fail_time = server.cluster_node_timeout + 5000; - - /* Don't log if we have the same reason for some time. */ - if (reason == server.cluster->cant_failover_reason && - time(NULL)-lastlog_time < CLUSTER_CANT_FAILOVER_RELOG_PERIOD) - return; - - server.cluster->cant_failover_reason = reason; - - /* We also don't emit any log if the master failed no long ago, the - * goal of this function is to log slaves in a stalled condition for - * a long time. */ - if (myself->slaveof && - nodeFailed(myself->slaveof) && - (mstime() - myself->slaveof->fail_time) < nolog_fail_time) return; - - switch(reason) { - case CLUSTER_CANT_FAILOVER_DATA_AGE: - msg = "Disconnected from master for longer than allowed. " - "Please check the 'cluster-replica-validity-factor' configuration " - "option."; - break; - case CLUSTER_CANT_FAILOVER_WAITING_DELAY: - msg = "Waiting the delay before I can start a new failover."; - break; - case CLUSTER_CANT_FAILOVER_EXPIRED: - msg = "Failover attempt expired."; - break; - case CLUSTER_CANT_FAILOVER_WAITING_VOTES: - msg = "Waiting for votes, but majority still not reached."; - break; - default: - msg = "Unknown reason code."; - break; - } - lastlog_time = time(NULL); - serverLog(LL_NOTICE,"Currently unable to failover: %s", msg); - - int cur_vote = server.cluster->failover_auth_count; - int cur_quorum = (server.cluster->size / 2) + 1; - /* Emits a log when an election is in progress and waiting for votes or when the failover attempt expired. */ - if (reason == CLUSTER_CANT_FAILOVER_WAITING_VOTES || reason == CLUSTER_CANT_FAILOVER_EXPIRED) { - serverLog(LL_NOTICE, "Needed quorum: %d. 
Number of votes received so far: %d", cur_quorum, cur_vote); - } -} - -/* This function implements the final part of automatic and manual failovers, - * where the slave grabs its master's hash slots, and propagates the new - * configuration. - * - * Note that it's up to the caller to be sure that the node got a new - * configuration epoch already. */ -void clusterFailoverReplaceYourMaster(void) { - int j; - clusterNode *oldmaster = myself->slaveof; - - if (nodeIsMaster(myself) || oldmaster == NULL) return; - - /* 1) Turn this node into a master. */ - clusterSetNodeAsMaster(myself); - replicationUnsetMaster(); - - /* 2) Claim all the slots assigned to our master. */ - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (clusterNodeGetSlotBit(oldmaster,j)) { - clusterDelSlot(j); - clusterAddSlot(myself,j); - } - } - - /* 3) Update state and save config. */ - clusterUpdateState(); - clusterSaveConfigOrDie(1); - - /* 4) Pong all the other nodes so that they can update the state - * accordingly and detect that we switched to master role. */ - clusterBroadcastPong(CLUSTER_BROADCAST_ALL); - - /* 5) If there was a manual failover in progress, clear the state. */ - resetManualFailover(); -} - -/* This function is called if we are a slave node and our master serving - * a non-zero amount of hash slots is in FAIL state. - * - * The goal of this function is: - * 1) To check if we are able to perform a failover, is our data updated? - * 2) Try to get elected by masters. - * 3) Perform the failover informing all the other nodes. 
- */ -void clusterHandleSlaveFailover(void) { - mstime_t data_age; - mstime_t auth_age = mstime() - server.cluster->failover_auth_time; - int needed_quorum = (server.cluster->size / 2) + 1; - int manual_failover = server.cluster->mf_end != 0 && - server.cluster->mf_can_start; - mstime_t auth_timeout, auth_retry_time; - - server.cluster->todo_before_sleep &= ~CLUSTER_TODO_HANDLE_FAILOVER; - - /* Compute the failover timeout (the max time we have to send votes - * and wait for replies), and the failover retry time (the time to wait - * before trying to get voted again). - * - * Timeout is MAX(NODE_TIMEOUT*2,2000) milliseconds. - * Retry is two times the Timeout. - */ - auth_timeout = server.cluster_node_timeout*2; - if (auth_timeout < 2000) auth_timeout = 2000; - auth_retry_time = auth_timeout*2; - - /* Pre conditions to run the function, that must be met both in case - * of an automatic or manual failover: - * 1) We are a slave. - * 2) Our master is flagged as FAIL, or this is a manual failover. - * 3) We don't have the no failover configuration set, and this is - * not a manual failover. - * 4) It is serving slots. */ - if (nodeIsMaster(myself) || - myself->slaveof == NULL || - (!nodeFailed(myself->slaveof) && !manual_failover) || - (server.cluster_slave_no_failover && !manual_failover) || - myself->slaveof->numslots == 0) - { - /* There are no reasons to failover, so we set the reason why we - * are returning without failing over to NONE. */ - server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE; - return; - } - - /* Set data_age to the number of milliseconds we are disconnected from - * the master. 
*/ - if (server.repl_state == REPL_STATE_CONNECTED) { - data_age = (mstime_t)(server.unixtime - server.master->lastinteraction) - * 1000; - } else { - data_age = (mstime_t)(server.unixtime - server.repl_down_since) * 1000; - } - - /* Remove the node timeout from the data age as it is fine that we are - * disconnected from our master at least for the time it was down to be - * flagged as FAIL, that's the baseline. */ - if (data_age > server.cluster_node_timeout) - data_age -= server.cluster_node_timeout; - - /* Check if our data is recent enough according to the slave validity - * factor configured by the user. - * - * Check bypassed for manual failovers. */ - if (server.cluster_slave_validity_factor && - data_age > - (((mstime_t)server.repl_ping_slave_period * 1000) + - (server.cluster_node_timeout * server.cluster_slave_validity_factor))) - { - if (!manual_failover) { - clusterLogCantFailover(CLUSTER_CANT_FAILOVER_DATA_AGE); - return; - } - } - - /* If the previous failover attempt timeout and the retry time has - * elapsed, we can setup a new one. */ - if (auth_age > auth_retry_time) { - server.cluster->failover_auth_time = mstime() + - 500 + /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */ - random() % 500; /* Random delay between 0 and 500 milliseconds. */ - server.cluster->failover_auth_count = 0; - server.cluster->failover_auth_sent = 0; - server.cluster->failover_auth_rank = clusterGetSlaveRank(); - /* We add another delay that is proportional to the slave rank. - * Specifically 1 second * rank. This way slaves that have a probably - * less updated replication offset, are penalized. */ - server.cluster->failover_auth_time += - server.cluster->failover_auth_rank * 1000; - /* However if this is a manual failover, no delay is needed. 
*/ - if (server.cluster->mf_end) { - server.cluster->failover_auth_time = mstime(); - server.cluster->failover_auth_rank = 0; - clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); - } - serverLog(LL_NOTICE, - "Start of election delayed for %lld milliseconds " - "(rank #%d, offset %lld).", - server.cluster->failover_auth_time - mstime(), - server.cluster->failover_auth_rank, - replicationGetSlaveOffset()); - /* Now that we have a scheduled election, broadcast our offset - * to all the other slaves so that they'll updated their offsets - * if our offset is better. */ - clusterBroadcastPong(CLUSTER_BROADCAST_LOCAL_SLAVES); - return; - } - - /* It is possible that we received more updated offsets from other - * slaves for the same master since we computed our election delay. - * Update the delay if our rank changed. - * - * Not performed if this is a manual failover. */ - if (server.cluster->failover_auth_sent == 0 && - server.cluster->mf_end == 0) - { - int newrank = clusterGetSlaveRank(); - if (newrank > server.cluster->failover_auth_rank) { - long long added_delay = - (newrank - server.cluster->failover_auth_rank) * 1000; - server.cluster->failover_auth_time += added_delay; - server.cluster->failover_auth_rank = newrank; - serverLog(LL_NOTICE, - "Replica rank updated to #%d, added %lld milliseconds of delay.", - newrank, added_delay); - } - } - - /* Return ASAP if we can't still start the election. */ - if (mstime() < server.cluster->failover_auth_time) { - clusterLogCantFailover(CLUSTER_CANT_FAILOVER_WAITING_DELAY); - return; - } - - /* Return ASAP if the election is too old to be valid. */ - if (auth_age > auth_timeout) { - clusterLogCantFailover(CLUSTER_CANT_FAILOVER_EXPIRED); - return; - } - - /* Ask for votes if needed. 
*/ - if (server.cluster->failover_auth_sent == 0) { - server.cluster->currentEpoch++; - server.cluster->failover_auth_epoch = server.cluster->currentEpoch; - serverLog(LL_NOTICE,"Starting a failover election for epoch %llu.", - (unsigned long long) server.cluster->currentEpoch); - clusterRequestFailoverAuth(); - server.cluster->failover_auth_sent = 1; - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_FSYNC_CONFIG); - return; /* Wait for replies. */ - } - - /* Check if we reached the quorum. */ - if (server.cluster->failover_auth_count >= needed_quorum) { - /* We have the quorum, we can finally failover the master. */ - - serverLog(LL_NOTICE, - "Failover election won: I'm the new master."); - - /* Update my configEpoch to the epoch of the election. */ - if (myself->configEpoch < server.cluster->failover_auth_epoch) { - myself->configEpoch = server.cluster->failover_auth_epoch; - serverLog(LL_NOTICE, - "configEpoch set to %llu after successful failover", - (unsigned long long) myself->configEpoch); - } - - /* Take responsibility for the cluster slots. */ - clusterFailoverReplaceYourMaster(); - } else { - clusterLogCantFailover(CLUSTER_CANT_FAILOVER_WAITING_VOTES); - } -} - -/* ----------------------------------------------------------------------------- - * CLUSTER slave migration - * - * Slave migration is the process that allows a slave of a master that is - * already covered by at least another slave, to "migrate" to a master that - * is orphaned, that is, left with no working slaves. - * ------------------------------------------------------------------------- */ - -/* This function is responsible to decide if this replica should be migrated - * to a different (orphaned) master. It is called by the clusterCron() function - * only if: - * - * 1) We are a slave node. - * 2) It was detected that there is at least one orphaned master in - * the cluster. 
- * 3) We are a slave of one of the masters with the greatest number of - * slaves. - * - * This checks are performed by the caller since it requires to iterate - * the nodes anyway, so we spend time into clusterHandleSlaveMigration() - * if definitely needed. - * - * The function is called with a pre-computed max_slaves, that is the max - * number of working (not in FAIL state) slaves for a single master. - * - * Additional conditions for migration are examined inside the function. - */ -void clusterHandleSlaveMigration(int max_slaves) { - int j, okslaves = 0; - clusterNode *mymaster = myself->slaveof, *target = NULL, *candidate = NULL; - dictIterator *di; - dictEntry *de; - - /* Step 1: Don't migrate if the cluster state is not ok. */ - if (server.cluster->state != CLUSTER_OK) return; - - /* Step 2: Don't migrate if my master will not be left with at least - * 'migration-barrier' slaves after my migration. */ - if (mymaster == NULL) return; - for (j = 0; j < mymaster->numslaves; j++) - if (!nodeFailed(mymaster->slaves[j]) && - !nodeTimedOut(mymaster->slaves[j])) okslaves++; - if (okslaves <= server.cluster_migration_barrier) return; - - /* Step 3: Identify a candidate for migration, and check if among the - * masters with the greatest number of ok slaves, I'm the one with the - * smallest node ID (the "candidate slave"). - * - * Note: this means that eventually a replica migration will occur - * since slaves that are reachable again always have their FAIL flag - * cleared, so eventually there must be a candidate. - * There is a possible race condition causing multiple - * slaves to migrate at the same time, but this is unlikely to - * happen and relatively harmless when it does. 
*/ - candidate = myself; - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - int okslaves = 0, is_orphaned = 1; - - /* We want to migrate only if this master is working, orphaned, and - * used to have slaves or if failed over a master that had slaves - * (MIGRATE_TO flag). This way we only migrate to instances that were - * supposed to have replicas. */ - if (nodeIsSlave(node) || nodeFailed(node)) is_orphaned = 0; - if (!(node->flags & CLUSTER_NODE_MIGRATE_TO)) is_orphaned = 0; - - /* Check number of working slaves. */ - if (nodeIsMaster(node)) okslaves = clusterCountNonFailingSlaves(node); - if (okslaves > 0) is_orphaned = 0; - - if (is_orphaned) { - if (!target && node->numslots > 0) target = node; - - /* Track the starting time of the orphaned condition for this - * master. */ - if (!node->orphaned_time) node->orphaned_time = mstime(); - } else { - node->orphaned_time = 0; - } - - /* Check if I'm the slave candidate for the migration: attached - * to a master with the maximum number of slaves and with the smallest - * node ID. */ - if (okslaves == max_slaves) { - for (j = 0; j < node->numslaves; j++) { - if (memcmp(node->slaves[j]->name, - candidate->name, - CLUSTER_NAMELEN) < 0) - { - candidate = node->slaves[j]; - } - } - } - } - dictReleaseIterator(di); - - /* Step 4: perform the migration if there is a target, and if I'm the - * candidate, but only if the master is continuously orphaned for a - * couple of seconds, so that during failovers, we give some time to - * the natural slaves of this instance to advertise their switch from - * the old master to the new one. 
*/ - if (target && candidate == myself && - (mstime()-target->orphaned_time) > CLUSTER_SLAVE_MIGRATION_DELAY && - !(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER)) - { - serverLog(LL_NOTICE,"Migrating to orphaned master %.40s", - target->name); - clusterSetMaster(target); - } -} - -/* ----------------------------------------------------------------------------- - * CLUSTER manual failover - * - * This are the important steps performed by slaves during a manual failover: - * 1) User send CLUSTER FAILOVER command. The failover state is initialized - * setting mf_end to the millisecond unix time at which we'll abort the - * attempt. - * 2) Slave sends a MFSTART message to the master requesting to pause clients - * for two times the manual failover timeout CLUSTER_MF_TIMEOUT. - * When master is paused for manual failover, it also starts to flag - * packets with CLUSTERMSG_FLAG0_PAUSED. - * 3) Slave waits for master to send its replication offset flagged as PAUSED. - * 4) If slave received the offset from the master, and its offset matches, - * mf_can_start is set to 1, and clusterHandleSlaveFailover() will perform - * the failover as usually, with the difference that the vote request - * will be modified to force masters to vote for a slave that has a - * working master. - * - * From the point of view of the master things are simpler: when a - * PAUSE_CLIENTS packet is received the master sets mf_end as well and - * the sender in mf_slave. During the time limit for the manual failover - * the master will just send PINGs more often to this slave, flagged with - * the PAUSED flag, so that the slave will set mf_master_offset when receiving - * a packet from the master with this flag set. - * - * The goal of the manual failover is to perform a fast failover without - * data loss due to the asynchronous master-slave replication. - * -------------------------------------------------------------------------- */ - -/* Reset the manual failover state. 
This works for both masters and slaves - * as all the state about manual failover is cleared. - * - * The function can be used both to initialize the manual failover state at - * startup or to abort a manual failover in progress. */ -void resetManualFailover(void) { - if (server.cluster->mf_slave) { - /* We were a master failing over, so we paused clients and related actions. - * Regardless of the outcome we unpause now to allow traffic again. */ - unpauseActions(PAUSE_DURING_FAILOVER); - } - server.cluster->mf_end = 0; /* No manual failover in progress. */ - server.cluster->mf_can_start = 0; - server.cluster->mf_slave = NULL; - server.cluster->mf_master_offset = -1; -} - -/* If a manual failover timed out, abort it. */ -void manualFailoverCheckTimeout(void) { - if (server.cluster->mf_end && server.cluster->mf_end < mstime()) { - serverLog(LL_WARNING,"Manual failover timed out."); - resetManualFailover(); - } -} - -/* This function is called from the cluster cron function in order to go - * forward with a manual failover state machine. */ -void clusterHandleManualFailover(void) { - /* Return ASAP if no manual failover is in progress. */ - if (server.cluster->mf_end == 0) return; - - /* If mf_can_start is non-zero, the failover was already triggered so the - * next steps are performed by clusterHandleSlaveFailover(). */ - if (server.cluster->mf_can_start) return; - - if (server.cluster->mf_master_offset == -1) return; /* Wait for offset... */ - - if (server.cluster->mf_master_offset == replicationGetSlaveOffset()) { - /* Our replication offset matches the master replication offset - * announced after clients were paused. We can start the failover. 
*/ - server.cluster->mf_can_start = 1; - serverLog(LL_NOTICE, - "All master replication stream processed, " - "manual failover can start."); - clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); - return; - } - clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER); -} - -/* ----------------------------------------------------------------------------- - * CLUSTER cron job - * -------------------------------------------------------------------------- */ - -/* Check if the node is disconnected and re-establish the connection. - * Also update a few stats while we are here, that can be used to make - * better decisions in other part of the code. */ -static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t handshake_timeout, mstime_t now) { - /* Not interested in reconnecting the link with myself or nodes - * for which we have no address. */ - if (node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_NOADDR)) return 1; - - if (node->flags & CLUSTER_NODE_PFAIL) - server.cluster->stats_pfail_nodes++; - - /* A Node in HANDSHAKE state has a limited lifespan equal to the - * configured node timeout. */ - if (nodeInHandshake(node) && now - node->ctime > handshake_timeout) { - clusterDelNode(node); - return 1; - } - - if (node->link == NULL) { - clusterLink *link = createClusterLink(node); - link->conn = connCreate(connTypeOfCluster()); - connSetPrivateData(link->conn, link); - if (connConnect(link->conn, node->ip, node->cport, server.bind_source_addr, - clusterLinkConnectHandler) == C_ERR) { - /* We got a synchronous error from connect before - * clusterSendPing() had a chance to be called. - * If node->ping_sent is zero, failure detection can't work, - * so we claim we actually sent a ping now (that will - * be really sent as soon as the link is obtained). 
*/ - if (node->ping_sent == 0) node->ping_sent = mstime(); - serverLog(LL_DEBUG, "Unable to connect to " - "Cluster Node [%s]:%d -> %s", node->ip, - node->cport, server.neterr); - - freeClusterLink(link); - return 0; - } - } - return 0; -} - -static void freeClusterLinkOnBufferLimitReached(clusterLink *link) { - if (link == NULL || server.cluster_link_msg_queue_limit_bytes == 0) { - return; - } - - unsigned long long mem_link = link->send_msg_queue_mem; - if (mem_link > server.cluster_link_msg_queue_limit_bytes) { - serverLog(LL_WARNING, "Freeing cluster link(%s node %.40s, used memory: %llu) due to " - "exceeding send buffer memory limit.", link->inbound ? "from" : "to", - link->node ? link->node->name : "", mem_link); - freeClusterLink(link); - server.cluster->stat_cluster_links_buffer_limit_exceeded++; - } -} - -/* Free outbound link to a node if its send buffer size exceeded limit. */ -static void clusterNodeCronFreeLinkOnBufferLimitReached(clusterNode *node) { - freeClusterLinkOnBufferLimitReached(node->link); - freeClusterLinkOnBufferLimitReached(node->inbound_link); -} - -/* This is executed 10 times every second */ -void clusterCron(void) { - dictIterator *di; - dictEntry *de; - int update_state = 0; - int orphaned_masters; /* How many masters there are without ok slaves. */ - int max_slaves; /* Max number of ok slaves for a single master. */ - int this_slaves; /* Number of ok slaves for our master (if we are slave). */ - mstime_t min_pong = 0, now = mstime(); - clusterNode *min_pong_node = NULL; - static unsigned long long iteration = 0; - mstime_t handshake_timeout; - - iteration++; /* Number of times this function was called so far. */ - - clusterUpdateMyselfHostname(); - - /* The handshake timeout is the time after which a handshake node that was - * not turned into a normal node is removed from the nodes. Usually it is - * just the NODE_TIMEOUT value, but when NODE_TIMEOUT is too small we use - * the value of 1 second. 
*/ - handshake_timeout = server.cluster_node_timeout; - if (handshake_timeout < 1000) handshake_timeout = 1000; - - /* Clear so clusterNodeCronHandleReconnect can count the number of nodes in PFAIL. */ - server.cluster->stats_pfail_nodes = 0; - /* Run through some of the operations we want to do on each cluster node. */ - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - /* We free the inbound or outboud link to the node if the link has an - * oversized message send queue and immediately try reconnecting. */ - clusterNodeCronFreeLinkOnBufferLimitReached(node); - /* The protocol is that function(s) below return non-zero if the node was - * terminated. - */ - if(clusterNodeCronHandleReconnect(node, handshake_timeout, now)) continue; - } - dictReleaseIterator(di); - - /* Ping some random node 1 time every 10 iterations, so that we usually ping - * one random node every second. */ - if (!(iteration % 10)) { - int j; - - /* Check a few random nodes and ping the one with the oldest - * pong_received time. */ - for (j = 0; j < 5; j++) { - de = dictGetRandomKey(server.cluster->nodes); - clusterNode *this = dictGetVal(de); - - /* Don't ping nodes disconnected or with a ping currently active. */ - if (this->link == NULL || this->ping_sent != 0) continue; - if (this->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE)) - continue; - if (min_pong_node == NULL || min_pong > this->pong_received) { - min_pong_node = this; - min_pong = this->pong_received; - } - } - if (min_pong_node) { - serverLog(LL_DEBUG,"Pinging node %.40s", min_pong_node->name); - clusterSendPing(min_pong_node->link, CLUSTERMSG_TYPE_PING); - } - } - - /* Iterate nodes to check if we need to flag something as failing. - * This loop is also responsible to: - * 1) Check if there are orphaned masters (masters without non failing - * slaves). - * 2) Count the max number of non failing slaves for a single master. 
- * 3) Count the number of slaves for our master, if we are a slave. */ - orphaned_masters = 0; - max_slaves = 0; - this_slaves = 0; - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - now = mstime(); /* Use an updated time at every iteration. */ - - if (node->flags & - (CLUSTER_NODE_MYSELF|CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE)) - continue; - - /* Orphaned master check, useful only if the current instance - * is a slave that may migrate to another master. */ - if (nodeIsSlave(myself) && nodeIsMaster(node) && !nodeFailed(node)) { - int okslaves = clusterCountNonFailingSlaves(node); - - /* A master is orphaned if it is serving a non-zero number of - * slots, have no working slaves, but used to have at least one - * slave, or failed over a master that used to have slaves. */ - if (okslaves == 0 && node->numslots > 0 && - node->flags & CLUSTER_NODE_MIGRATE_TO) - { - orphaned_masters++; - } - if (okslaves > max_slaves) max_slaves = okslaves; - if (myself->slaveof == node) - this_slaves = okslaves; - } - - /* If we are not receiving any data for more than half the cluster - * timeout, reconnect the link: maybe there is a connection - * issue even if the node is alive. */ - mstime_t ping_delay = now - node->ping_sent; - mstime_t data_delay = now - node->data_received; - if (node->link && /* is connected */ - now - node->link->ctime > - server.cluster_node_timeout && /* was not already reconnected */ - node->ping_sent && /* we already sent a ping */ - /* and we are waiting for the pong more than timeout/2 */ - ping_delay > server.cluster_node_timeout/2 && - /* and in such interval we are not seeing any traffic at all. */ - data_delay > server.cluster_node_timeout/2) - { - /* Disconnect the link, it will be reconnected automatically. 
*/ - freeClusterLink(node->link); - } - - /* If we have currently no active ping in this instance, and the - * received PONG is older than half the cluster timeout, send - * a new ping now, to ensure all the nodes are pinged without - * a too big delay. */ - mstime_t ping_interval = server.cluster_ping_interval ? - server.cluster_ping_interval : server.cluster_node_timeout/2; - if (node->link && - node->ping_sent == 0 && - (now - node->pong_received) > ping_interval) - { - clusterSendPing(node->link, CLUSTERMSG_TYPE_PING); - continue; - } - - /* If we are a master and one of the slaves requested a manual - * failover, ping it continuously. */ - if (server.cluster->mf_end && - nodeIsMaster(myself) && - server.cluster->mf_slave == node && - node->link) - { - clusterSendPing(node->link, CLUSTERMSG_TYPE_PING); - continue; - } - - /* Check only if we have an active ping for this instance. */ - if (node->ping_sent == 0) continue; - - /* Check if this node looks unreachable. - * Note that if we already received the PONG, then node->ping_sent - * is zero, so can't reach this code at all, so we don't risk of - * checking for a PONG delay if we didn't sent the PING. - * - * We also consider every incoming data as proof of liveness, since - * our cluster bus link is also used for data: under heavy data - * load pong delays are possible. */ - mstime_t node_delay = (ping_delay < data_delay) ? ping_delay : - data_delay; - - if (node_delay > server.cluster_node_timeout) { - /* Timeout reached. Set the node as possibly failing if it is - * not already in this state. */ - if (!(node->flags & (CLUSTER_NODE_PFAIL|CLUSTER_NODE_FAIL))) { - serverLog(LL_DEBUG,"*** NODE %.40s possibly failing", - node->name); - node->flags |= CLUSTER_NODE_PFAIL; - update_state = 1; - } - } - } - dictReleaseIterator(di); - - /* If we are a slave node but the replication is still turned off, - * enable it if we know the address of our master and it appears to - * be up. 
*/ - if (nodeIsSlave(myself) && - server.masterhost == NULL && - myself->slaveof && - nodeHasAddr(myself->slaveof)) - { - replicationSetMaster(myself->slaveof->ip, getNodeDefaultReplicationPort(myself->slaveof)); - } - - /* Abort a manual failover if the timeout is reached. */ - manualFailoverCheckTimeout(); - - if (nodeIsSlave(myself)) { - clusterHandleManualFailover(); - if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER)) - clusterHandleSlaveFailover(); - /* If there are orphaned slaves, and we are a slave among the masters - * with the max number of non-failing slaves, consider migrating to - * the orphaned masters. Note that it does not make sense to try - * a migration if there is no master with at least *two* working - * slaves. */ - if (orphaned_masters && max_slaves >= 2 && this_slaves == max_slaves && - server.cluster_allow_replica_migration) - clusterHandleSlaveMigration(max_slaves); - } - - if (update_state || server.cluster->state == CLUSTER_FAIL) - clusterUpdateState(); -} - -/* This function is called before the event handler returns to sleep for - * events. It is useful to perform operations that must be done ASAP in - * reaction to events fired but that are not safe to perform inside event - * handlers, or to perform potentially expansive tasks that we need to do - * a single time before replying to clients. */ -void clusterBeforeSleep(void) { - int flags = server.cluster->todo_before_sleep; - - /* Reset our flags (not strictly needed since every single function - * called for flags set should be able to clear its flag). 
*/ - server.cluster->todo_before_sleep = 0; - - if (flags & CLUSTER_TODO_HANDLE_MANUALFAILOVER) { - /* Handle manual failover as soon as possible so that won't have a 100ms - * as it was handled only in clusterCron */ - if(nodeIsSlave(myself)) { - clusterHandleManualFailover(); - if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER)) - clusterHandleSlaveFailover(); - } - } else if (flags & CLUSTER_TODO_HANDLE_FAILOVER) { - /* Handle failover, this is needed when it is likely that there is already - * the quorum from masters in order to react fast. */ - clusterHandleSlaveFailover(); - } - - /* Update the cluster state. */ - if (flags & CLUSTER_TODO_UPDATE_STATE) - clusterUpdateState(); - - /* Save the config, possibly using fsync. */ - if (flags & CLUSTER_TODO_SAVE_CONFIG) { - int fsync = flags & CLUSTER_TODO_FSYNC_CONFIG; - clusterSaveConfigOrDie(fsync); - } -} - -void clusterDoBeforeSleep(int flags) { - server.cluster->todo_before_sleep |= flags; -} - -/* ----------------------------------------------------------------------------- - * Slots management - * -------------------------------------------------------------------------- */ - -/* Test bit 'pos' in a generic bitmap. Return 1 if the bit is set, - * otherwise 0. */ -int bitmapTestBit(unsigned char *bitmap, int pos) { - off_t byte = pos/8; - int bit = pos&7; - return (bitmap[byte] & (1<nodes); - dictEntry *de; - int slaves = 0; - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - - if (nodeIsSlave(node)) continue; - slaves += node->numslaves; - } - dictReleaseIterator(di); - return slaves != 0; -} - -/* Set the slot bit and return the old value. 
*/ -int clusterNodeSetSlotBit(clusterNode *n, int slot) { - int old = bitmapTestBit(n->slots,slot); - if (!old) { - bitmapSetBit(n->slots,slot); - n->numslots++; - /* When a master gets its first slot, even if it has no slaves, - * it gets flagged with MIGRATE_TO, that is, the master is a valid - * target for replicas migration, if and only if at least one of - * the other masters has slaves right now. - * - * Normally masters are valid targets of replica migration if: - * 1. The used to have slaves (but no longer have). - * 2. They are slaves failing over a master that used to have slaves. - * - * However new masters with slots assigned are considered valid - * migration targets if the rest of the cluster is not a slave-less. - * - * See https://github.com/redis/redis/issues/3043 for more info. */ - if (n->numslots == 1 && clusterMastersHaveSlaves()) - n->flags |= CLUSTER_NODE_MIGRATE_TO; - } - return old; -} - -/* Clear the slot bit and return the old value. */ -int clusterNodeClearSlotBit(clusterNode *n, int slot) { - int old = bitmapTestBit(n->slots,slot); - if (old) { - bitmapClearBit(n->slots,slot); - n->numslots--; - } - return old; -} - -/* Return the slot bit from the cluster node structure. */ -int clusterNodeGetSlotBit(clusterNode *n, int slot) { - return bitmapTestBit(n->slots,slot); -} - -/* Add the specified slot to the list of slots that node 'n' will - * serve. Return C_OK if the operation ended with success. - * If the slot is already assigned to another instance this is considered - * an error and C_ERR is returned. */ -int clusterAddSlot(clusterNode *n, int slot) { - if (server.cluster->slots[slot]) return C_ERR; - clusterNodeSetSlotBit(n,slot); - server.cluster->slots[slot] = n; - return C_OK; -} - -/* Delete the specified slot marking it as unassigned. - * Returns C_OK if the slot was assigned, otherwise if the slot was - * already unassigned C_ERR is returned. 
*/ -int clusterDelSlot(int slot) { - clusterNode *n = server.cluster->slots[slot]; - - if (!n) return C_ERR; - - /* Cleanup the channels in master/replica as part of slot deletion. */ - removeChannelsInSlot(slot); - /* Clear the slot bit. */ - serverAssert(clusterNodeClearSlotBit(n,slot) == 1); - server.cluster->slots[slot] = NULL; - /* Make owner_not_claiming_slot flag consistent with slot ownership information. */ - bitmapClearBit(server.cluster->owner_not_claiming_slot, slot); - return C_OK; -} - -/* Delete all the slots associated with the specified node. - * The number of deleted slots is returned. */ -int clusterDelNodeSlots(clusterNode *node) { - int deleted = 0, j; - - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (clusterNodeGetSlotBit(node,j)) { - clusterDelSlot(j); - deleted++; - } - } - return deleted; -} - -/* Clear the migrating / importing state for all the slots. - * This is useful at initialization and when turning a master into slave. */ -void clusterCloseAllSlots(void) { - memset(server.cluster->migrating_slots_to,0, - sizeof(server.cluster->migrating_slots_to)); - memset(server.cluster->importing_slots_from,0, - sizeof(server.cluster->importing_slots_from)); -} - -/* ----------------------------------------------------------------------------- - * Cluster state evaluation function - * -------------------------------------------------------------------------- */ - -/* The following are defines that are only used in the evaluation function - * and are based on heuristics. Actually the main point about the rejoin and - * writable delay is that they should be a few orders of magnitude larger - * than the network latency. 
*/ -#define CLUSTER_MAX_REJOIN_DELAY 5000 -#define CLUSTER_MIN_REJOIN_DELAY 500 -#define CLUSTER_WRITABLE_DELAY 2000 - -void clusterUpdateState(void) { - int j, new_state; - int reachable_masters = 0; - static mstime_t among_minority_time; - static mstime_t first_call_time = 0; - - server.cluster->todo_before_sleep &= ~CLUSTER_TODO_UPDATE_STATE; - - /* If this is a master node, wait some time before turning the state - * into OK, since it is not a good idea to rejoin the cluster as a writable - * master, after a reboot, without giving the cluster a chance to - * reconfigure this node. Note that the delay is calculated starting from - * the first call to this function and not since the server start, in order - * to not count the DB loading time. */ - if (first_call_time == 0) first_call_time = mstime(); - if (nodeIsMaster(myself) && - server.cluster->state == CLUSTER_FAIL && - mstime() - first_call_time < CLUSTER_WRITABLE_DELAY) return; - - /* Start assuming the state is OK. We'll turn it into FAIL if there - * are the right conditions. */ - new_state = CLUSTER_OK; - - /* Check if all the slots are covered. */ - if (server.cluster_require_full_coverage) { - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (server.cluster->slots[j] == NULL || - server.cluster->slots[j]->flags & (CLUSTER_NODE_FAIL)) - { - new_state = CLUSTER_FAIL; - break; - } - } - } - - /* Compute the cluster size, that is the number of master nodes - * serving at least a single slot. - * - * At the same time count the number of reachable masters having - * at least one slot. 
*/ - { - dictIterator *di; - dictEntry *de; - - server.cluster->size = 0; - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - - if (nodeIsMaster(node) && node->numslots) { - server.cluster->size++; - if ((node->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) == 0) - reachable_masters++; - } - } - dictReleaseIterator(di); - } - - /* If we are in a minority partition, change the cluster state - * to FAIL. */ - { - int needed_quorum = (server.cluster->size / 2) + 1; - - if (reachable_masters < needed_quorum) { - new_state = CLUSTER_FAIL; - among_minority_time = mstime(); - } - } - - /* Log a state change */ - if (new_state != server.cluster->state) { - mstime_t rejoin_delay = server.cluster_node_timeout; - - /* If the instance is a master and was partitioned away with the - * minority, don't let it accept queries for some time after the - * partition heals, to make sure there is enough time to receive - * a configuration update. */ - if (rejoin_delay > CLUSTER_MAX_REJOIN_DELAY) - rejoin_delay = CLUSTER_MAX_REJOIN_DELAY; - if (rejoin_delay < CLUSTER_MIN_REJOIN_DELAY) - rejoin_delay = CLUSTER_MIN_REJOIN_DELAY; - - if (new_state == CLUSTER_OK && - nodeIsMaster(myself) && - mstime() - among_minority_time < rejoin_delay) - { - return; - } - - /* Change the state and log the event. */ - serverLog(new_state == CLUSTER_OK ? LL_NOTICE : LL_WARNING, - "Cluster state changed: %s", - new_state == CLUSTER_OK ? "ok" : "fail"); - server.cluster->state = new_state; - } -} - -/* This function is called after the node startup in order to verify that data - * loaded from disk is in agreement with the cluster configuration: - * - * 1) If we find keys about hash slots we have no responsibility for, the - * following happens: - * A) If no other node is in charge according to the current cluster - * configuration, we add these slots to our node. 
- * B) If according to our config other nodes are already in charge for - * this slots, we set the slots as IMPORTING from our point of view - * in order to justify we have those slots, and in order to make - * redis-cli aware of the issue, so that it can try to fix it. - * 2) If we find data in a DB different than DB0 we return C_ERR to - * signal the caller it should quit the server with an error message - * or take other actions. - * - * The function always returns C_OK even if it will try to correct - * the error described in "1". However if data is found in DB different - * from DB0, C_ERR is returned. - * - * The function also uses the logging facility in order to warn the user - * about desynchronizations between the data we have in memory and the - * cluster configuration. */ -int verifyClusterConfigWithData(void) { - int j; - int update_config = 0; - - /* Return ASAP if a module disabled cluster redirections. In that case - * every master can store keys about every possible hash slot. */ - if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION) - return C_OK; - - /* If this node is a slave, don't perform the check at all as we - * completely depend on the replication stream. */ - if (nodeIsSlave(myself)) return C_OK; - - /* Make sure we only have keys in DB0. */ - for (j = 1; j < server.dbnum; j++) { - if (dbSize(&server.db[j], DB_MAIN)) return C_ERR; - } - - /* Check that all the slots we see populated memory have a corresponding - * entry in the cluster table. Otherwise fix the table. */ - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (!countKeysInSlot(j)) continue; /* No keys in this slot. */ - /* Check if we are assigned to this slot or if we are importing it. - * In both cases check the next slot as the configuration makes - * sense. 
*/ - if (server.cluster->slots[j] == myself || - server.cluster->importing_slots_from[j] != NULL) continue; - - /* If we are here data and cluster config don't agree, and we have - * slot 'j' populated even if we are not importing it, nor we are - * assigned to this slot. Fix this condition. */ - - update_config++; - /* Case A: slot is unassigned. Take responsibility for it. */ - if (server.cluster->slots[j] == NULL) { - serverLog(LL_NOTICE, "I have keys for unassigned slot %d. " - "Taking responsibility for it.",j); - clusterAddSlot(myself,j); - } else { - serverLog(LL_NOTICE, "I have keys for slot %d, but the slot is " - "assigned to another node. " - "Setting it to importing state.",j); - server.cluster->importing_slots_from[j] = server.cluster->slots[j]; - } - } - if (update_config) clusterSaveConfigOrDie(1); - return C_OK; -} - -/* Remove all the shard channel related information not owned by the current shard. */ -static inline void removeAllNotOwnedShardChannelSubscriptions(void) { - if (!dictSize(server.pubsubshard_channels)) return; - clusterNode *currmaster = nodeIsMaster(myself) ? myself : myself->slaveof; - for (int j = 0; j < CLUSTER_SLOTS; j++) { - if (server.cluster->slots[j] != currmaster) { - removeChannelsInSlot(j); - } - } -} - -/* ----------------------------------------------------------------------------- - * SLAVE nodes handling - * -------------------------------------------------------------------------- */ - -/* Set the specified node 'n' as master for this node. - * If this node is currently a master, it is turned into a slave. 
*/ -void clusterSetMaster(clusterNode *n) { - serverAssert(n != myself); - serverAssert(myself->numslots == 0); - - if (nodeIsMaster(myself)) { - myself->flags &= ~(CLUSTER_NODE_MASTER|CLUSTER_NODE_MIGRATE_TO); - myself->flags |= CLUSTER_NODE_SLAVE; - clusterCloseAllSlots(); - } else { - if (myself->slaveof) - clusterNodeRemoveSlave(myself->slaveof,myself); - } - myself->slaveof = n; - updateShardId(myself, n->shard_id); - clusterNodeAddSlave(n,myself); - replicationSetMaster(n->ip, getNodeDefaultReplicationPort(n)); - removeAllNotOwnedShardChannelSubscriptions(); - resetManualFailover(); -} - -/* ----------------------------------------------------------------------------- - * Nodes to string representation functions. - * -------------------------------------------------------------------------- */ - -struct redisNodeFlags { - uint16_t flag; - char *name; -}; - -static struct redisNodeFlags redisNodeFlagsTable[] = { - {CLUSTER_NODE_MYSELF, "myself,"}, - {CLUSTER_NODE_MASTER, "master,"}, - {CLUSTER_NODE_SLAVE, "slave,"}, - {CLUSTER_NODE_PFAIL, "fail?,"}, - {CLUSTER_NODE_FAIL, "fail,"}, - {CLUSTER_NODE_HANDSHAKE, "handshake,"}, - {CLUSTER_NODE_NOADDR, "noaddr,"}, - {CLUSTER_NODE_NOFAILOVER, "nofailover,"} -}; - -/* Concatenate the comma separated list of node flags to the given SDS - * string 'ci'. */ -sds representClusterNodeFlags(sds ci, uint16_t flags) { - size_t orig_len = sdslen(ci); - int i, size = sizeof(redisNodeFlagsTable)/sizeof(struct redisNodeFlags); - for (i = 0; i < size; i++) { - struct redisNodeFlags *nodeflag = redisNodeFlagsTable + i; - if (flags & nodeflag->flag) ci = sdscat(ci, nodeflag->name); - } - /* If no flag was added, add the "noflags" special flag. */ - if (sdslen(ci) == orig_len) ci = sdscat(ci,"noflags,"); - sdsIncrLen(ci,-1); /* Remove trailing comma. */ - return ci; -} - -/* Concatenate the slot ownership information to the given SDS string 'ci'. 
- * If the slot ownership is in a contiguous block, it's represented as start-end pair, - * else each slot is added separately. */ -sds representSlotInfo(sds ci, uint16_t *slot_info_pairs, int slot_info_pairs_count) { - for (int i = 0; i< slot_info_pairs_count; i+=2) { - unsigned long start = slot_info_pairs[i]; - unsigned long end = slot_info_pairs[i+1]; - if (start == end) { - ci = sdscatfmt(ci, " %i", start); - } else { - ci = sdscatfmt(ci, " %i-%i", start, end); - } - } - return ci; -} - -/* Generate a csv-alike representation of the specified cluster node. - * See clusterGenNodesDescription() top comment for more information. - * - * The function returns the string representation as an SDS string. */ -sds clusterGenNodeDescription(client *c, clusterNode *node, int tls_primary) { - int j, start; - sds ci; - int port = getNodeClientPort(node, tls_primary); - - /* Node coordinates */ - ci = sdscatlen(sdsempty(),node->name,CLUSTER_NAMELEN); - ci = sdscatfmt(ci," %s:%i@%i", - node->ip, - port, - node->cport); - if (sdslen(node->hostname) != 0) { - ci = sdscatfmt(ci,",%s", node->hostname); - } - /* Don't expose aux fields to any clients yet but do allow them - * to be persisted to nodes.conf */ - if (c == NULL) { - if (sdslen(node->hostname) == 0) { - ci = sdscatfmt(ci,",", 1); - } - for (int i = af_count-1; i >=0; i--) { - if ((tls_primary && i == af_tls_port) || (!tls_primary && i == af_tcp_port)) { - continue; - } - if (auxFieldHandlers[i].isPresent(node)) { - ci = sdscatprintf(ci, ",%s=", auxFieldHandlers[i].field); - ci = auxFieldHandlers[i].getter(node, ci); - } - } - } - - /* Flags */ - ci = sdscatlen(ci," ",1); - ci = representClusterNodeFlags(ci, node->flags); - - /* Slave of... 
or just "-" */ - ci = sdscatlen(ci," ",1); - if (node->slaveof) - ci = sdscatlen(ci,node->slaveof->name,CLUSTER_NAMELEN); - else - ci = sdscatlen(ci,"-",1); - - unsigned long long nodeEpoch = node->configEpoch; - if (nodeIsSlave(node) && node->slaveof) { - nodeEpoch = node->slaveof->configEpoch; - } - /* Latency from the POV of this node, config epoch, link status */ - ci = sdscatfmt(ci," %I %I %U %s", - (long long) node->ping_sent, - (long long) node->pong_received, - nodeEpoch, - (node->link || node->flags & CLUSTER_NODE_MYSELF) ? - "connected" : "disconnected"); - - /* Slots served by this instance. If we already have slots info, - * append it directly, otherwise, generate slots only if it has. */ - if (node->slot_info_pairs) { - ci = representSlotInfo(ci, node->slot_info_pairs, node->slot_info_pairs_count); - } else if (node->numslots > 0) { - start = -1; - for (j = 0; j < CLUSTER_SLOTS; j++) { - int bit; - - if ((bit = clusterNodeGetSlotBit(node,j)) != 0) { - if (start == -1) start = j; - } - if (start != -1 && (!bit || j == CLUSTER_SLOTS-1)) { - if (bit && j == CLUSTER_SLOTS-1) j++; - - if (start == j-1) { - ci = sdscatfmt(ci," %i",start); - } else { - ci = sdscatfmt(ci," %i-%i",start,j-1); - } - start = -1; - } - } - } - - /* Just for MYSELF node we also dump info about slots that - * we are migrating to other instances or importing from other - * instances. */ - if (node->flags & CLUSTER_NODE_MYSELF) { - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (server.cluster->migrating_slots_to[j]) { - ci = sdscatprintf(ci," [%d->-%.40s]",j, - server.cluster->migrating_slots_to[j]->name); - } else if (server.cluster->importing_slots_from[j]) { - ci = sdscatprintf(ci," [%d-<-%.40s]",j, - server.cluster->importing_slots_from[j]->name); - } - } - } - return ci; -} - -/* Generate the slot topology for all nodes and store the string representation - * in the slots_info struct on the node. 
This is used to improve the efficiency - * of clusterGenNodesDescription() because it removes looping of the slot space - * for generating the slot info for each node individually. */ -void clusterGenNodesSlotsInfo(int filter) { - clusterNode *n = NULL; - int start = -1; - - for (int i = 0; i <= CLUSTER_SLOTS; i++) { - /* Find start node and slot id. */ - if (n == NULL) { - if (i == CLUSTER_SLOTS) break; - n = server.cluster->slots[i]; - start = i; - continue; - } - - /* Generate slots info when occur different node with start - * or end of slot. */ - if (i == CLUSTER_SLOTS || n != server.cluster->slots[i]) { - if (!(n->flags & filter)) { - if (!n->slot_info_pairs) { - n->slot_info_pairs = zmalloc(2 * n->numslots * sizeof(uint16_t)); - } - serverAssert((n->slot_info_pairs_count + 1) < (2 * n->numslots)); - n->slot_info_pairs[n->slot_info_pairs_count++] = start; - n->slot_info_pairs[n->slot_info_pairs_count++] = i-1; - } - if (i == CLUSTER_SLOTS) break; - n = server.cluster->slots[i]; - start = i; - } - } -} - -void clusterFreeNodesSlotsInfo(clusterNode *n) { - zfree(n->slot_info_pairs); - n->slot_info_pairs = NULL; - n->slot_info_pairs_count = 0; -} - -/* Generate a csv-alike representation of the nodes we are aware of, - * including the "myself" node, and return an SDS string containing the - * representation (it is up to the caller to free it). - * - * All the nodes matching at least one of the node flags specified in - * "filter" are excluded from the output, so using zero as a filter will - * include all the known nodes in the representation, including nodes in - * the HANDSHAKE state. - * - * Setting tls_primary to 1 to put TLS port in the main : - * field and put TCP port in aux field, instead of the opposite way. - * - * The representation obtained using this function is used for the output - * of the CLUSTER NODES function, and as format for the cluster - * configuration file (nodes.conf) for a given node. 
*/ -sds clusterGenNodesDescription(client *c, int filter, int tls_primary) { - sds ci = sdsempty(), ni; - dictIterator *di; - dictEntry *de; - - /* Generate all nodes slots info firstly. */ - clusterGenNodesSlotsInfo(filter); - - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - - if (node->flags & filter) continue; - ni = clusterGenNodeDescription(c, node, tls_primary); - ci = sdscatsds(ci,ni); - sdsfree(ni); - ci = sdscatlen(ci,"\n",1); - - /* Release slots info. */ - clusterFreeNodesSlotsInfo(node); - } - dictReleaseIterator(di); - return ci; -} - -/* Add to the output buffer of the given client the description of the given cluster link. - * The description is a map with each entry being an attribute of the link. */ -void addReplyClusterLinkDescription(client *c, clusterLink *link) { - addReplyMapLen(c, 6); - - addReplyBulkCString(c, "direction"); - addReplyBulkCString(c, link->inbound ? "from" : "to"); - - /* addReplyClusterLinkDescription is only called for links that have been - * associated with nodes. The association is always bi-directional, so - * in addReplyClusterLinkDescription, link->node should never be NULL. 
*/ - serverAssert(link->node); - sds node_name = sdsnewlen(link->node->name, CLUSTER_NAMELEN); - addReplyBulkCString(c, "node"); - addReplyBulkCString(c, node_name); - sdsfree(node_name); - - addReplyBulkCString(c, "create-time"); - addReplyLongLong(c, link->ctime); - - char events[3], *p; - p = events; - if (link->conn) { - if (connHasReadHandler(link->conn)) *p++ = 'r'; - if (connHasWriteHandler(link->conn)) *p++ = 'w'; - } - *p = '\0'; - addReplyBulkCString(c, "events"); - addReplyBulkCString(c, events); - - addReplyBulkCString(c, "send-buffer-allocated"); - addReplyLongLong(c, link->send_msg_queue_mem); - - addReplyBulkCString(c, "send-buffer-used"); - addReplyLongLong(c, link->send_msg_queue_mem); -} - -/* Add to the output buffer of the given client an array of cluster link descriptions, - * with array entry being a description of a single current cluster link. */ -void addReplyClusterLinksDescription(client *c) { - dictIterator *di; - dictEntry *de; - void *arraylen_ptr = NULL; - int num_links = 0; - - arraylen_ptr = addReplyDeferredLen(c); - - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - if (node->link) { - num_links++; - addReplyClusterLinkDescription(c, node->link); - } - if (node->inbound_link) { - num_links++; - addReplyClusterLinkDescription(c, node->inbound_link); - } - } - dictReleaseIterator(di); - - setDeferredArrayLen(c, arraylen_ptr, num_links); -} - -/* ----------------------------------------------------------------------------- - * CLUSTER command - * -------------------------------------------------------------------------- */ - -const char *getPreferredEndpoint(clusterNode *n) { - switch(server.cluster_preferred_endpoint_type) { - case CLUSTER_ENDPOINT_TYPE_IP: return n->ip; - case CLUSTER_ENDPOINT_TYPE_HOSTNAME: return (sdslen(n->hostname) != 0) ? 
n->hostname : "?"; - case CLUSTER_ENDPOINT_TYPE_UNKNOWN_ENDPOINT: return ""; - } - return "unknown"; -} - -const char *clusterGetMessageTypeString(int type) { - switch(type) { - case CLUSTERMSG_TYPE_PING: return "ping"; - case CLUSTERMSG_TYPE_PONG: return "pong"; - case CLUSTERMSG_TYPE_MEET: return "meet"; - case CLUSTERMSG_TYPE_FAIL: return "fail"; - case CLUSTERMSG_TYPE_PUBLISH: return "publish"; - case CLUSTERMSG_TYPE_PUBLISHSHARD: return "publishshard"; - case CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST: return "auth-req"; - case CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK: return "auth-ack"; - case CLUSTERMSG_TYPE_UPDATE: return "update"; - case CLUSTERMSG_TYPE_MFSTART: return "mfstart"; - case CLUSTERMSG_TYPE_MODULE: return "module"; - } - return "unknown"; -} - -int getSlotOrReply(client *c, robj *o) { - long long slot; - - if (getLongLongFromObject(o,&slot) != C_OK || - slot < 0 || slot >= CLUSTER_SLOTS) - { - addReplyError(c,"Invalid or out of range slot"); - return -1; - } - return (int) slot; -} - -/* Returns an indication if the replica node is fully available - * and should be listed in CLUSTER SLOTS response. - * Returns 1 for available nodes, 0 for nodes that have - * not finished their initial sync, in failed state, or are - * otherwise considered not available to serve read commands. */ -static int isReplicaAvailable(clusterNode *node) { - if (nodeFailed(node)) { - return 0; - } - long long repl_offset = node->repl_offset; - if (node->flags & CLUSTER_NODE_MYSELF) { - /* Nodes do not update their own information - * in the cluster node list. 
*/ - repl_offset = replicationGetSlaveOffset(); - } - return (repl_offset != 0); -} - -int checkSlotAssignmentsOrReply(client *c, unsigned char *slots, int del, int start_slot, int end_slot) { - int slot; - for (slot = start_slot; slot <= end_slot; slot++) { - if (del && server.cluster->slots[slot] == NULL) { - addReplyErrorFormat(c,"Slot %d is already unassigned", slot); - return C_ERR; - } else if (!del && server.cluster->slots[slot]) { - addReplyErrorFormat(c,"Slot %d is already busy", slot); - return C_ERR; - } - if (slots[slot]++ == 1) { - addReplyErrorFormat(c,"Slot %d specified multiple times",(int)slot); - return C_ERR; - } - } - return C_OK; -} - -void clusterUpdateSlots(client *c, unsigned char *slots, int del) { - int j; - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (slots[j]) { - int retval; - - /* If this slot was set as importing we can clear this - * state as now we are the real owner of the slot. */ - if (server.cluster->importing_slots_from[j]) - server.cluster->importing_slots_from[j] = NULL; - - retval = del ? clusterDelSlot(j) : - clusterAddSlot(myself,j); - serverAssertWithInfo(c,NULL,retval == C_OK); - } - } -} - -void addNodeToNodeReply(client *c, clusterNode *node) { - addReplyArrayLen(c, 4); - if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_IP) { - addReplyBulkCString(c, node->ip); - } else if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_HOSTNAME) { - if (sdslen(node->hostname) != 0) { - addReplyBulkCBuffer(c, node->hostname, sdslen(node->hostname)); - } else { - addReplyBulkCString(c, "?"); - } - } else if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_UNKNOWN_ENDPOINT) { - addReplyNull(c); - } else { - serverPanic("Unrecognized preferred endpoint type"); - } - - /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. 
*/ - addReplyLongLong(c, getNodeClientPort(node, shouldReturnTlsInfo())); - addReplyBulkCBuffer(c, node->name, CLUSTER_NAMELEN); - - /* Add the additional endpoint information, this is all the known networking information - * that is not the preferred endpoint. Note the logic is evaluated twice so we can - * correctly report the number of additional network arguments without using a deferred - * map, an assertion is made at the end to check we set the right length. */ - int length = 0; - if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_IP) { - length++; - } - if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_HOSTNAME - && sdslen(node->hostname) != 0) - { - length++; - } - addReplyMapLen(c, length); - - if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_IP) { - addReplyBulkCString(c, "ip"); - addReplyBulkCString(c, node->ip); - length--; - } - if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_HOSTNAME - && sdslen(node->hostname) != 0) - { - addReplyBulkCString(c, "hostname"); - addReplyBulkCBuffer(c, node->hostname, sdslen(node->hostname)); - length--; - } - serverAssert(length == 0); -} - -void addNodeReplyForClusterSlot(client *c, clusterNode *node, int start_slot, int end_slot) { - int i, nested_elements = 3; /* slots (2) + master addr (1) */ - for (i = 0; i < node->numslaves; i++) { - if (!isReplicaAvailable(node->slaves[i])) continue; - nested_elements++; - } - addReplyArrayLen(c, nested_elements); - addReplyLongLong(c, start_slot); - addReplyLongLong(c, end_slot); - addNodeToNodeReply(c, node); - - /* Remaining nodes in reply are replicas for slot range */ - for (i = 0; i < node->numslaves; i++) { - /* This loop is copy/pasted from clusterGenNodeDescription() - * with modifications for per-slot node aggregation. 
*/ - if (!isReplicaAvailable(node->slaves[i])) continue; - addNodeToNodeReply(c, node->slaves[i]); - nested_elements--; - } - serverAssert(nested_elements == 3); /* Original 3 elements */ -} - -/* Add detailed information of a node to the output buffer of the given client. */ -void addNodeDetailsToShardReply(client *c, clusterNode *node) { - int reply_count = 0; - void *node_replylen = addReplyDeferredLen(c); - addReplyBulkCString(c, "id"); - addReplyBulkCBuffer(c, node->name, CLUSTER_NAMELEN); - reply_count++; - - if (node->tcp_port) { - addReplyBulkCString(c, "port"); - addReplyLongLong(c, node->tcp_port); - reply_count++; - } - - if (node->tls_port) { - addReplyBulkCString(c, "tls-port"); - addReplyLongLong(c, node->tls_port); - reply_count++; - } - - addReplyBulkCString(c, "ip"); - addReplyBulkCString(c, node->ip); - reply_count++; - - addReplyBulkCString(c, "endpoint"); - addReplyBulkCString(c, getPreferredEndpoint(node)); - reply_count++; - - if (sdslen(node->hostname) != 0) { - addReplyBulkCString(c, "hostname"); - addReplyBulkCBuffer(c, node->hostname, sdslen(node->hostname)); - reply_count++; - } - - long long node_offset; - if (node->flags & CLUSTER_NODE_MYSELF) { - node_offset = nodeIsSlave(node) ? replicationGetSlaveOffset() : server.master_repl_offset; - } else { - node_offset = node->repl_offset; - } - - addReplyBulkCString(c, "role"); - addReplyBulkCString(c, nodeIsSlave(node) ? "replica" : "master"); - reply_count++; - - addReplyBulkCString(c, "replication-offset"); - addReplyLongLong(c, node_offset); - reply_count++; - - addReplyBulkCString(c, "health"); - const char *health_msg = NULL; - if (nodeFailed(node)) { - health_msg = "fail"; - } else if (nodeIsSlave(node) && node_offset == 0) { - health_msg = "loading"; - } else { - health_msg = "online"; - } - addReplyBulkCString(c, health_msg); - reply_count++; - - setDeferredMapLen(c, node_replylen, reply_count); -} - -/* Add the shard reply of a single shard based off the given primary node. 
*/ -void addShardReplyForClusterShards(client *c, list *nodes) { - serverAssert(listLength(nodes) > 0); - clusterNode *n = listNodeValue(listFirst(nodes)); - addReplyMapLen(c, 2); - addReplyBulkCString(c, "slots"); - - /* Use slot_info_pairs from the primary only */ - while (n->slaveof != NULL) n = n->slaveof; - - if (n->slot_info_pairs != NULL) { - serverAssert((n->slot_info_pairs_count % 2) == 0); - addReplyArrayLen(c, n->slot_info_pairs_count); - for (int i = 0; i < n->slot_info_pairs_count; i++) - addReplyLongLong(c, (unsigned long)n->slot_info_pairs[i]); - } else { - /* If no slot info pair is provided, the node owns no slots */ - addReplyArrayLen(c, 0); - } - - addReplyBulkCString(c, "nodes"); - addReplyArrayLen(c, listLength(nodes)); - listIter li; - listRewind(nodes, &li); - for (listNode *ln = listNext(&li); ln != NULL; ln = listNext(&li)) { - clusterNode *n = listNodeValue(ln); - addNodeDetailsToShardReply(c, n); - clusterFreeNodesSlotsInfo(n); - } -} - -/* Add to the output buffer of the given client, an array of slot (start, end) - * pair owned by the shard, also the primary and set of replica(s) along with - * information about each node. */ -void clusterReplyShards(client *c) { - addReplyArrayLen(c, dictSize(server.cluster->shards)); - /* This call will add slot_info_pairs to all nodes */ - clusterGenNodesSlotsInfo(0); - dictIterator *di = dictGetSafeIterator(server.cluster->shards); - for(dictEntry *de = dictNext(di); de != NULL; de = dictNext(di)) { - addShardReplyForClusterShards(c, dictGetVal(de)); - } - dictReleaseIterator(di); -} - -void clusterReplyMultiBulkSlots(client * c) { - /* Format: 1) 1) start slot - * 2) end slot - * 3) 1) master IP - * 2) master port - * 3) node ID - * 4) 1) replica IP - * 2) replica port - * 3) node ID - * ... 
continued until done - */ - clusterNode *n = NULL; - int num_masters = 0, start = -1; - void *slot_replylen = addReplyDeferredLen(c); - - for (int i = 0; i <= CLUSTER_SLOTS; i++) { - /* Find start node and slot id. */ - if (n == NULL) { - if (i == CLUSTER_SLOTS) break; - n = server.cluster->slots[i]; - start = i; - continue; - } - - /* Add cluster slots info when occur different node with start - * or end of slot. */ - if (i == CLUSTER_SLOTS || n != server.cluster->slots[i]) { - addNodeReplyForClusterSlot(c, n, start, i-1); - num_masters++; - if (i == CLUSTER_SLOTS) break; - n = server.cluster->slots[i]; - start = i; - } - } - setDeferredArrayLen(c, slot_replylen, num_masters); -} - -sds genClusterInfoString(void) { - sds info = sdsempty(); - char *statestr[] = {"ok","fail"}; - int slots_assigned = 0, slots_ok = 0, slots_pfail = 0, slots_fail = 0; - uint64_t myepoch; - int j; - - for (j = 0; j < CLUSTER_SLOTS; j++) { - clusterNode *n = server.cluster->slots[j]; - - if (n == NULL) continue; - slots_assigned++; - if (nodeFailed(n)) { - slots_fail++; - } else if (nodeTimedOut(n)) { - slots_pfail++; - } else { - slots_ok++; - } - } - - myepoch = (nodeIsSlave(myself) && myself->slaveof) ? - myself->slaveof->configEpoch : myself->configEpoch; - - info = sdscatprintf(info, - "cluster_state:%s\r\n" - "cluster_slots_assigned:%d\r\n" - "cluster_slots_ok:%d\r\n" - "cluster_slots_pfail:%d\r\n" - "cluster_slots_fail:%d\r\n" - "cluster_known_nodes:%lu\r\n" - "cluster_size:%d\r\n" - "cluster_current_epoch:%llu\r\n" - "cluster_my_epoch:%llu\r\n" - , statestr[server.cluster->state], - slots_assigned, - slots_ok, - slots_pfail, - slots_fail, - dictSize(server.cluster->nodes), - server.cluster->size, - (unsigned long long) server.cluster->currentEpoch, - (unsigned long long) myepoch - ); - - /* Show stats about messages sent and received. 
*/ - long long tot_msg_sent = 0; - long long tot_msg_received = 0; - - for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) { - if (server.cluster->stats_bus_messages_sent[i] == 0) continue; - tot_msg_sent += server.cluster->stats_bus_messages_sent[i]; - info = sdscatprintf(info, - "cluster_stats_messages_%s_sent:%lld\r\n", - clusterGetMessageTypeString(i), - server.cluster->stats_bus_messages_sent[i]); - } - info = sdscatprintf(info, - "cluster_stats_messages_sent:%lld\r\n", tot_msg_sent); - - for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) { - if (server.cluster->stats_bus_messages_received[i] == 0) continue; - tot_msg_received += server.cluster->stats_bus_messages_received[i]; - info = sdscatprintf(info, - "cluster_stats_messages_%s_received:%lld\r\n", - clusterGetMessageTypeString(i), - server.cluster->stats_bus_messages_received[i]); - } - info = sdscatprintf(info, - "cluster_stats_messages_received:%lld\r\n", tot_msg_received); - - info = sdscatprintf(info, - "total_cluster_links_buffer_limit_exceeded:%llu\r\n", - server.cluster->stat_cluster_links_buffer_limit_exceeded); - - return info; -} - -void clusterCommand(client *c) { - if (server.cluster_enabled == 0) { - addReplyError(c,"This instance has cluster support disabled"); - return; - } - - if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"help")) { - const char *help[] = { -"ADDSLOTS [ ...]", -" Assign slots to current node.", -"ADDSLOTSRANGE [ ...]", -" Assign slots which are between and to current node.", -"BUMPEPOCH", -" Advance the cluster config epoch.", -"COUNT-FAILURE-REPORTS ", -" Return number of failure reports for .", -"COUNTKEYSINSLOT ", -" Return the number of keys in .", -"DELSLOTS [ ...]", -" Delete slots information from current node.", -"DELSLOTSRANGE [ ...]", -" Delete slots information which are between and from current node.", -"FAILOVER [FORCE|TAKEOVER]", -" Promote current replica node to being a master.", -"FORGET ", -" Remove a node from the cluster.", -"GETKEYSINSLOT ", -" Return key 
names stored by current node in a slot.", -"FLUSHSLOTS", -" Delete current node own slots information.", -"INFO", -" Return information about the cluster.", -"KEYSLOT ", -" Return the hash slot for .", -"MEET []", -" Connect nodes into a working cluster.", -"MYID", -" Return the node id.", -"MYSHARDID", -" Return the node's shard id.", -"NODES", -" Return cluster configuration seen by node. Output format:", -" ...", -"REPLICATE ", -" Configure current node as replica to .", -"RESET [HARD|SOFT]", -" Reset current node (default: soft).", -"SET-CONFIG-EPOCH ", -" Set config epoch of current node.", -"SETSLOT (IMPORTING |MIGRATING |STABLE|NODE )", -" Set slot state.", -"REPLICAS ", -" Return replicas.", -"SAVECONFIG", -" Force saving cluster configuration on disk.", -"SLOTS", -" Return information about slots range mappings. Each range is made of:", -" start, end, master and replicas IP addresses, ports and ids", -"SHARDS", -" Return information about slot range mappings and the nodes associated with them.", -"LINKS", -" Return information about all network links between this node and its peers.", -" Output format is an array where each array element is a map containing attributes of a link", -NULL - }; - addReplyHelp(c, help); - } else if (!strcasecmp(c->argv[1]->ptr,"meet") && (c->argc == 4 || c->argc == 5)) { - /* CLUSTER MEET [cport] */ - long long port, cport; - - if (getLongLongFromObject(c->argv[3], &port) != C_OK) { - addReplyErrorFormat(c,"Invalid base port specified: %s", - (char*)c->argv[3]->ptr); - return; - } - - if (c->argc == 5) { - if (getLongLongFromObject(c->argv[4], &cport) != C_OK) { - addReplyErrorFormat(c,"Invalid bus port specified: %s", - (char*)c->argv[4]->ptr); - return; - } - } else { - cport = port + CLUSTER_PORT_INCR; - } - - if (clusterStartHandshake(c->argv[2]->ptr,port,cport) == 0 && - errno == EINVAL) - { - addReplyErrorFormat(c,"Invalid node address specified: %s:%s", - (char*)c->argv[2]->ptr, (char*)c->argv[3]->ptr); - } else { - 
addReply(c,shared.ok); - } - } else if (!strcasecmp(c->argv[1]->ptr,"nodes") && c->argc == 2) { - /* CLUSTER NODES */ - /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */ - sds nodes = clusterGenNodesDescription(c, 0, shouldReturnTlsInfo()); - addReplyVerbatim(c,nodes,sdslen(nodes),"txt"); - sdsfree(nodes); - } else if (!strcasecmp(c->argv[1]->ptr,"myid") && c->argc == 2) { - /* CLUSTER MYID */ - addReplyBulkCBuffer(c,myself->name, CLUSTER_NAMELEN); - } else if (!strcasecmp(c->argv[1]->ptr,"myshardid") && c->argc == 2) { - /* CLUSTER MYSHARDID */ - addReplyBulkCBuffer(c,myself->shard_id, CLUSTER_NAMELEN); - } else if (!strcasecmp(c->argv[1]->ptr,"slots") && c->argc == 2) { - /* CLUSTER SLOTS */ - clusterReplyMultiBulkSlots(c); - } else if (!strcasecmp(c->argv[1]->ptr,"shards") && c->argc == 2) { - /* CLUSTER SHARDS */ - clusterReplyShards(c); - } else if (!strcasecmp(c->argv[1]->ptr,"flushslots") && c->argc == 2) { - /* CLUSTER FLUSHSLOTS */ - if (dbSize(&server.db[0], DB_MAIN) != 0) { - addReplyError(c,"DB must be empty to perform CLUSTER FLUSHSLOTS."); - return; - } - clusterDelNodeSlots(myself); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } else if ((!strcasecmp(c->argv[1]->ptr,"addslots") || - !strcasecmp(c->argv[1]->ptr,"delslots")) && c->argc >= 3) - { - /* CLUSTER ADDSLOTS [slot] ... */ - /* CLUSTER DELSLOTS [slot] ... */ - int j, slot; - unsigned char *slots = zmalloc(CLUSTER_SLOTS); - int del = !strcasecmp(c->argv[1]->ptr,"delslots"); - - memset(slots,0,CLUSTER_SLOTS); - /* Check that all the arguments are parseable.*/ - for (j = 2; j < c->argc; j++) { - if ((slot = getSlotOrReply(c,c->argv[j])) == C_ERR) { - zfree(slots); - return; - } - } - /* Check that the slots are not already busy. 
*/ - for (j = 2; j < c->argc; j++) { - slot = getSlotOrReply(c,c->argv[j]); - if (checkSlotAssignmentsOrReply(c, slots, del, slot, slot) == C_ERR) { - zfree(slots); - return; - } - } - clusterUpdateSlots(c, slots, del); - zfree(slots); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } else if ((!strcasecmp(c->argv[1]->ptr,"addslotsrange") || - !strcasecmp(c->argv[1]->ptr,"delslotsrange")) && c->argc >= 4) { - if (c->argc % 2 == 1) { - addReplyErrorArity(c); - return; - } - /* CLUSTER ADDSLOTSRANGE [ ...] */ - /* CLUSTER DELSLOTSRANGE [ ...] */ - int j, startslot, endslot; - unsigned char *slots = zmalloc(CLUSTER_SLOTS); - int del = !strcasecmp(c->argv[1]->ptr,"delslotsrange"); - - memset(slots,0,CLUSTER_SLOTS); - /* Check that all the arguments are parseable and that all the - * slots are not already busy. */ - for (j = 2; j < c->argc; j += 2) { - if ((startslot = getSlotOrReply(c,c->argv[j])) == C_ERR) { - zfree(slots); - return; - } - if ((endslot = getSlotOrReply(c,c->argv[j+1])) == C_ERR) { - zfree(slots); - return; - } - if (startslot > endslot) { - addReplyErrorFormat(c,"start slot number %d is greater than end slot number %d", startslot, endslot); - zfree(slots); - return; - } - - if (checkSlotAssignmentsOrReply(c, slots, del, startslot, endslot) == C_ERR) { - zfree(slots); - return; - } - } - clusterUpdateSlots(c, slots, del); - zfree(slots); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"setslot") && c->argc >= 4) { - /* SETSLOT 10 MIGRATING */ - /* SETSLOT 10 IMPORTING */ - /* SETSLOT 10 STABLE */ - /* SETSLOT 10 NODE */ - int slot; - clusterNode *n; - - if (nodeIsSlave(myself)) { - addReplyError(c,"Please use SETSLOT only with masters."); - return; - } - - if ((slot = getSlotOrReply(c,c->argv[2])) == -1) return; - - if (!strcasecmp(c->argv[3]->ptr,"migrating") && c->argc == 5) { - if 
(server.cluster->slots[slot] != myself) { - addReplyErrorFormat(c,"I'm not the owner of hash slot %u",slot); - return; - } - n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); - if (n == NULL) { - addReplyErrorFormat(c,"I don't know about node %s", - (char*)c->argv[4]->ptr); - return; - } - if (nodeIsSlave(n)) { - addReplyError(c,"Target node is not a master"); - return; - } - server.cluster->migrating_slots_to[slot] = n; - } else if (!strcasecmp(c->argv[3]->ptr,"importing") && c->argc == 5) { - if (server.cluster->slots[slot] == myself) { - addReplyErrorFormat(c, - "I'm already the owner of hash slot %u",slot); - return; - } - n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); - if (n == NULL) { - addReplyErrorFormat(c,"I don't know about node %s", - (char*)c->argv[4]->ptr); - return; - } - if (nodeIsSlave(n)) { - addReplyError(c,"Target node is not a master"); - return; - } - server.cluster->importing_slots_from[slot] = n; - } else if (!strcasecmp(c->argv[3]->ptr,"stable") && c->argc == 4) { - /* CLUSTER SETSLOT STABLE */ - server.cluster->importing_slots_from[slot] = NULL; - server.cluster->migrating_slots_to[slot] = NULL; - } else if (!strcasecmp(c->argv[3]->ptr,"node") && c->argc == 5) { - /* CLUSTER SETSLOT NODE */ - n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); - if (!n) { - addReplyErrorFormat(c,"Unknown node %s", - (char*)c->argv[4]->ptr); - return; - } - if (nodeIsSlave(n)) { - addReplyError(c,"Target node is not a master"); - return; - } - /* If this hash slot was served by 'myself' before to switch - * make sure there are no longer local keys for this hash slot. 
*/ - if (server.cluster->slots[slot] == myself && n != myself) { - if (countKeysInSlot(slot) != 0) { - addReplyErrorFormat(c, - "Can't assign hashslot %d to a different node " - "while I still hold keys for this hash slot.", slot); - return; - } - } - /* If this slot is in migrating status but we have no keys - * for it assigning the slot to another node will clear - * the migrating status. */ - if (countKeysInSlot(slot) == 0 && - server.cluster->migrating_slots_to[slot]) - server.cluster->migrating_slots_to[slot] = NULL; - - int slot_was_mine = server.cluster->slots[slot] == myself; - clusterDelSlot(slot); - clusterAddSlot(n,slot); - - /* If we are a master left without slots, we should turn into a - * replica of the new master. */ - if (slot_was_mine && - n != myself && - myself->numslots == 0 && - server.cluster_allow_replica_migration) - { - serverLog(LL_NOTICE, - "Configuration change detected. Reconfiguring myself " - "as a replica of %.40s (%s)", n->name, n->human_nodename); - clusterSetMaster(n); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | - CLUSTER_TODO_UPDATE_STATE | - CLUSTER_TODO_FSYNC_CONFIG); - } - - /* If this node was importing this slot, assigning the slot to - * itself also clears the importing status. */ - if (n == myself && - server.cluster->importing_slots_from[slot]) - { - /* This slot was manually migrated, set this node configEpoch - * to a new epoch so that the new version can be propagated - * by the cluster. - * - * Note that if this ever results in a collision with another - * node getting the same configEpoch, for example because a - * failover happens at the same time we close the slot, the - * configEpoch collision resolution will fix it assigning - * a different epoch to each node. 
*/ - if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { - serverLog(LL_NOTICE, - "configEpoch updated after importing slot %d", slot); - } - server.cluster->importing_slots_from[slot] = NULL; - /* After importing this slot, let the other nodes know as - * soon as possible. */ - clusterBroadcastPong(CLUSTER_BROADCAST_ALL); - } - } else { - addReplyError(c, - "Invalid CLUSTER SETSLOT action or number of arguments. Try CLUSTER HELP"); - return; - } - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_UPDATE_STATE); - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"bumpepoch") && c->argc == 2) { - /* CLUSTER BUMPEPOCH */ - int retval = clusterBumpConfigEpochWithoutConsensus(); - sds reply = sdscatprintf(sdsempty(),"+%s %llu\r\n", - (retval == C_OK) ? "BUMPED" : "STILL", - (unsigned long long) myself->configEpoch); - addReplySds(c,reply); - } else if (!strcasecmp(c->argv[1]->ptr,"info") && c->argc == 2) { - /* CLUSTER INFO */ - - sds info = genClusterInfoString(); - - /* Produce the reply protocol. 
*/ - addReplyVerbatim(c,info,sdslen(info),"txt"); - sdsfree(info); - } else if (!strcasecmp(c->argv[1]->ptr,"saveconfig") && c->argc == 2) { - int retval = clusterSaveConfig(1); - - if (retval == 0) - addReply(c,shared.ok); - else - addReplyErrorFormat(c,"error saving the cluster node config: %s", - strerror(errno)); - } else if (!strcasecmp(c->argv[1]->ptr,"keyslot") && c->argc == 3) { - /* CLUSTER KEYSLOT */ - sds key = c->argv[2]->ptr; - - addReplyLongLong(c,keyHashSlot(key,sdslen(key))); - } else if (!strcasecmp(c->argv[1]->ptr,"countkeysinslot") && c->argc == 3) { - /* CLUSTER COUNTKEYSINSLOT */ - long long slot; - - if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != C_OK) - return; - if (slot < 0 || slot >= CLUSTER_SLOTS) { - addReplyError(c,"Invalid slot"); - return; - } - addReplyLongLong(c,countKeysInSlot(slot)); - } else if (!strcasecmp(c->argv[1]->ptr,"getkeysinslot") && c->argc == 4) { - /* CLUSTER GETKEYSINSLOT */ - long long maxkeys, slot; - - if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != C_OK) - return; - if (getLongLongFromObjectOrReply(c,c->argv[3],&maxkeys,NULL) - != C_OK) - return; - if (slot < 0 || slot >= CLUSTER_SLOTS || maxkeys < 0) { - addReplyError(c,"Invalid slot or number of keys"); - return; - } - - unsigned int keys_in_slot = countKeysInSlot(slot); - unsigned int numkeys = maxkeys > keys_in_slot ? 
keys_in_slot : maxkeys; - addReplyArrayLen(c,numkeys); - dictIterator *iter = NULL; - dictEntry *de = NULL; - iter = dictGetIterator(server.db->dict[slot]); - for (unsigned int i = 0; i < numkeys; i++) { - de = dictNext(iter); - serverAssert(de != NULL); - sds sdskey = dictGetKey(de); - addReplyBulkCBuffer(c, sdskey, sdslen(sdskey)); - } - dictReleaseIterator(iter); - } else if (!strcasecmp(c->argv[1]->ptr,"forget") && c->argc == 3) { - /* CLUSTER FORGET */ - clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); - if (!n) { - if (clusterBlacklistExists((char*)c->argv[2]->ptr)) - /* Already forgotten. The deletion may have been gossipped by - * another node, so we pretend it succeeded. */ - addReply(c,shared.ok); - else - addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); - return; - } else if (n == myself) { - addReplyError(c,"I tried hard but I can't forget myself..."); - return; - } else if (nodeIsSlave(myself) && myself->slaveof == n) { - addReplyError(c,"Can't forget my master!"); - return; - } - clusterBlacklistAddNode(n); - clusterDelNode(n); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"replicate") && c->argc == 3) { - /* CLUSTER REPLICATE */ - /* Lookup the specified node in our table. */ - clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); - if (!n) { - addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); - return; - } - - /* I can't replicate myself. */ - if (n == myself) { - addReplyError(c,"Can't replicate myself"); - return; - } - - /* Can't replicate a slave. */ - if (nodeIsSlave(n)) { - addReplyError(c,"I can only replicate a master, not a replica."); - return; - } - - /* If the instance is currently a master, it should have no assigned - * slots nor keys to accept to replicate some other node. - * Slaves can switch to another master without issues. 
*/ - if (nodeIsMaster(myself) && - (myself->numslots != 0 || dbSize(&server.db[0], DB_MAIN) != 0)) { - addReplyError(c, - "To set a master the node must be empty and " - "without assigned slots."); - return; - } - - /* Set the master. */ - clusterSetMaster(n); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } else if ((!strcasecmp(c->argv[1]->ptr,"slaves") || - !strcasecmp(c->argv[1]->ptr,"replicas")) && c->argc == 3) { - /* CLUSTER SLAVES */ - /* CLUSTER REPLICAS */ - clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); - int j; - - /* Lookup the specified node in our table. */ - if (!n) { - addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); - return; - } - - if (nodeIsSlave(n)) { - addReplyError(c,"The specified node is not a master"); - return; - } - - /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */ - addReplyArrayLen(c,n->numslaves); - for (j = 0; j < n->numslaves; j++) { - sds ni = clusterGenNodeDescription(c, n->slaves[j], shouldReturnTlsInfo()); - addReplyBulkCString(c,ni); - sdsfree(ni); - } - } else if (!strcasecmp(c->argv[1]->ptr,"count-failure-reports") && - c->argc == 3) - { - /* CLUSTER COUNT-FAILURE-REPORTS */ - clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); - - if (!n) { - addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); - return; - } else { - addReplyLongLong(c,clusterNodeFailureReportsCount(n)); - } - } else if (!strcasecmp(c->argv[1]->ptr,"failover") && - (c->argc == 2 || c->argc == 3)) - { - /* CLUSTER FAILOVER [FORCE|TAKEOVER] */ - int force = 0, takeover = 0; - - if (c->argc == 3) { - if (!strcasecmp(c->argv[2]->ptr,"force")) { - force = 1; - } else if (!strcasecmp(c->argv[2]->ptr,"takeover")) { - takeover = 1; - force = 1; /* Takeover also implies force. */ - } else { - addReplyErrorObject(c,shared.syntaxerr); - return; - } - } - - /* Check preconditions. 
*/ - if (nodeIsMaster(myself)) { - addReplyError(c,"You should send CLUSTER FAILOVER to a replica"); - return; - } else if (myself->slaveof == NULL) { - addReplyError(c,"I'm a replica but my master is unknown to me"); - return; - } else if (!force && - (nodeFailed(myself->slaveof) || - myself->slaveof->link == NULL)) - { - addReplyError(c,"Master is down or failed, " - "please use CLUSTER FAILOVER FORCE"); - return; - } - resetManualFailover(); - server.cluster->mf_end = mstime() + CLUSTER_MF_TIMEOUT; - - if (takeover) { - /* A takeover does not perform any initial check. It just - * generates a new configuration epoch for this node without - * consensus, claims the master's slots, and broadcast the new - * configuration. */ - serverLog(LL_NOTICE,"Taking over the master (user request)."); - clusterBumpConfigEpochWithoutConsensus(); - clusterFailoverReplaceYourMaster(); - } else if (force) { - /* If this is a forced failover, we don't need to talk with our - * master to agree about the offset. We just failover taking over - * it without coordination. */ - serverLog(LL_NOTICE,"Forced failover user request accepted."); - server.cluster->mf_can_start = 1; - } else { - serverLog(LL_NOTICE,"Manual failover user request accepted."); - clusterSendMFStart(myself->slaveof); - } - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"set-config-epoch") && c->argc == 3) - { - /* CLUSTER SET-CONFIG-EPOCH - * - * The user is allowed to set the config epoch only when a node is - * totally fresh: no config epoch, no other known node, and so forth. - * This happens at cluster creation time to start with a cluster where - * every node has a different node ID, without to rely on the conflicts - * resolution system which is too slow when a big cluster is created. 
*/ - long long epoch; - - if (getLongLongFromObjectOrReply(c,c->argv[2],&epoch,NULL) != C_OK) - return; - - if (epoch < 0) { - addReplyErrorFormat(c,"Invalid config epoch specified: %lld",epoch); - } else if (dictSize(server.cluster->nodes) > 1) { - addReplyError(c,"The user can assign a config epoch only when the " - "node does not know any other node."); - } else if (myself->configEpoch != 0) { - addReplyError(c,"Node config epoch is already non-zero"); - } else { - myself->configEpoch = epoch; - serverLog(LL_NOTICE, - "configEpoch set to %llu via CLUSTER SET-CONFIG-EPOCH", - (unsigned long long) myself->configEpoch); - - if (server.cluster->currentEpoch < (uint64_t)epoch) - server.cluster->currentEpoch = epoch; - /* No need to fsync the config here since in the unlucky event - * of a failure to persist the config, the conflict resolution code - * will assign a unique config to this node. */ - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } - } else if (!strcasecmp(c->argv[1]->ptr,"reset") && - (c->argc == 2 || c->argc == 3)) - { - /* CLUSTER RESET [SOFT|HARD] */ - int hard = 0; - - /* Parse soft/hard argument. Default is soft. */ - if (c->argc == 3) { - if (!strcasecmp(c->argv[2]->ptr,"hard")) { - hard = 1; - } else if (!strcasecmp(c->argv[2]->ptr,"soft")) { - hard = 0; - } else { - addReplyErrorObject(c,shared.syntaxerr); - return; - } - } - - /* Slaves can be reset while containing data, but not master nodes - * that must be empty. 
*/ - if (nodeIsMaster(myself) && dbSize(c->db, DB_MAIN) != 0) { - addReplyError(c,"CLUSTER RESET can't be called with " - "master nodes containing keys"); - return; - } - clusterReset(hard); - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"links") && c->argc == 2) { - /* CLUSTER LINKS */ - addReplyClusterLinksDescription(c); - } else { - addReplySubcommandSyntaxError(c); - return; - } -} - -void removeChannelsInSlot(unsigned int slot) { - unsigned int channelcount = countChannelsInSlot(slot); - if (channelcount == 0) return; - - /* Retrieve all the channels for the slot. */ - robj **channels = zmalloc(sizeof(robj*)*channelcount); - raxIterator iter; - int j = 0; - unsigned char indexed[2]; - - indexed[0] = (slot >> 8) & 0xff; - indexed[1] = slot & 0xff; - raxStart(&iter,server.cluster->slots_to_channels); - raxSeek(&iter,">=",indexed,2); - while(raxNext(&iter)) { - if (iter.key[0] != indexed[0] || iter.key[1] != indexed[1]) break; - channels[j++] = createStringObject((char*)iter.key + 2, iter.key_len - 2); - } - raxStop(&iter); - - pubsubUnsubscribeShardChannels(channels, channelcount); - zfree(channels); + return connectionTypeTcp(); } /* ----------------------------------------------------------------------------- @@ -6638,7 +176,7 @@ void restoreCommand(client *c) { lfu_freq == -1) { if (getLongLongFromObjectOrReply(c,c->argv[j+1],&lru_idle,NULL) - != C_OK) return; + != C_OK) return; if (lru_idle < 0) { addReplyError(c,"Invalid IDLETIME value, must be >= 0"); return; @@ -6649,7 +187,7 @@ void restoreCommand(client *c) { lru_idle == -1) { if (getLongLongFromObjectOrReply(c,c->argv[j+1],&lfu_freq,NULL) - != C_OK) return; + != C_OK) return; if (lfu_freq < 0 || lfu_freq > 255) { addReplyError(c,"Invalid FREQ value, must be >= 0 and <= 255"); return; @@ -6728,7 +266,6 @@ void restoreCommand(client *c) { addReply(c,shared.ok); server.dirty++; } - /* MIGRATE socket cache implementation. 
* * We take a map between host:ip and a TCP socket that we used to connect @@ -6784,7 +321,7 @@ migrateCachedSocket* migrateGetSocket(client *c, robj *host, robj *port, long ti /* Create the connection */ conn = connCreate(connTypeOfCluster()); if (connBlockingConnect(conn, host->ptr, atoi(port->ptr), timeout) - != C_OK) { + != C_OK) { addReplyError(c,"-IOERR error or timeout connecting to the client"); connClose(conn); sdsfree(name); @@ -6891,8 +428,8 @@ void migrateCommand(client *c) { } else if (!strcasecmp(c->argv[j]->ptr,"keys")) { if (sdslen(c->argv[3]->ptr) != 0) { addReplyError(c, - "When using MIGRATE KEYS option, the key argument" - " must be set to the empty string"); + "When using MIGRATE KEYS option, the key argument" + " must be set to the empty string"); return; } first_key = j+1; @@ -6934,7 +471,7 @@ void migrateCommand(client *c) { return; } -try_again: + try_again: write_error = 0; /* Connect */ @@ -6953,10 +490,10 @@ try_again: serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"AUTH",4)); if (username) { serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,username, - sdslen(username))); + sdslen(username))); } serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,password, - sdslen(password))); + sdslen(password))); } /* Send the SELECT command if the current DB is not already selected. */ @@ -6992,24 +529,24 @@ try_again: kv[non_expired++] = kv[j]; serverAssertWithInfo(c,NULL, - rioWriteBulkCount(&cmd,'*',replace ? 5 : 4)); + rioWriteBulkCount(&cmd,'*',replace ? 
5 : 4)); if (server.cluster_enabled) serverAssertWithInfo(c,NULL, - rioWriteBulkString(&cmd,"RESTORE-ASKING",14)); + rioWriteBulkString(&cmd,"RESTORE-ASKING",14)); else serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"RESTORE",7)); serverAssertWithInfo(c,NULL,sdsEncodedObject(kv[j])); serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,kv[j]->ptr, - sdslen(kv[j]->ptr))); + sdslen(kv[j]->ptr))); serverAssertWithInfo(c,NULL,rioWriteBulkLongLong(&cmd,ttl)); /* Emit the payload argument, that is the serialized object using * the DUMP format. */ createDumpPayload(&payload,ov[j],kv[j],dbid); serverAssertWithInfo(c,NULL, - rioWriteBulkString(&cmd,payload.io.buffer.ptr, - sdslen(payload.io.buffer.ptr))); + rioWriteBulkString(&cmd,payload.io.buffer.ptr, + sdslen(payload.io.buffer.ptr))); sdsfree(payload.io.buffer.ptr); /* Add the REPLACE option to the RESTORE command if it was specified @@ -7081,7 +618,7 @@ try_again: error_from_target = 1; addReplyErrorFormat(c,"Target instance replied with error: %s", - errbuf+1); + errbuf+1); } } else { if (!copy) { @@ -7157,7 +694,7 @@ try_again: /* On socket errors we try to close the cached socket and try again. * It is very common for the cached socket to get closed, if just reopening * it works it's a shame to notify the error to the caller. */ -socket_err: + socket_err: /* Cleanup we want to perform in both the retry and no retry case. * Note: Closing the migrate socket will also force SELECT next time. */ sdsfree(cmd.io.buffer.ptr); @@ -7180,11 +717,693 @@ socket_err: /* Cleanup we want to do if no retry is attempted. */ zfree(ov); zfree(kv); addReplyErrorSds(c, sdscatprintf(sdsempty(), - "-IOERR error or timeout %s to target instance", - write_error ? "writing" : "reading")); + "-IOERR error or timeout %s to target instance", + write_error ? "writing" : "reading")); return; } +/* Cluster node sanity check. Returns C_OK if the node id + * is valid an C_ERR otherwise. 
*/ +int verifyClusterNodeId(const char *name, int length) { + if (length != CLUSTER_NAMELEN) return C_ERR; + for (int i = 0; i < length; i++) { + if (name[i] >= 'a' && name[i] <= 'z') continue; + if (name[i] >= '0' && name[i] <= '9') continue; + return C_ERR; + } + return C_OK; +} + +int isValidAuxChar(int c) { + return isalnum(c) || (strchr("!#$%&()*+:;<>?@[]^{|}~", c) == NULL); +} + +int isValidAuxString(char *s, unsigned int length) { + for (unsigned i = 0; i < length; i++) { + if (!isValidAuxChar(s[i])) return 0; + } + return 1; +} + +void clusterCommandMyId(client *c) { + char *name = clusterNodeGetName(getMyClusterNode()); + if (name) { + addReplyBulkCBuffer(c,name, CLUSTER_NAMELEN); + } else { + addReplyError(c, "No ID yet"); + } +} + +char* getMyClusterId(void) { + return clusterNodeGetName(getMyClusterNode()); +} + +void clusterCommandMyShardId(client *c) { + char *sid = clusterNodeGetShardId(getMyClusterNode()); + if (sid) { + addReplyBulkCBuffer(c,sid, CLUSTER_NAMELEN); + } else { + addReplyError(c, "No shard ID yet"); + } +} + +/* When a cluster command is called, we need to decide whether to return TLS info or + * non-TLS info by the client's connection type. However if the command is called by + * a Lua script or RM_call, there is no connection in the fake client, so we use + * server.current_client here to get the real client if available. And if it is not + * available (modules may call commands without a real client), we return the default + * info, which is determined by server.tls_cluster. 
*/ +static int shouldReturnTlsInfo(void) { + if (server.current_client && server.current_client->conn) { + return connIsTLS(server.current_client->conn); + } else { + return server.tls_cluster; + } +} + +unsigned int countKeysInSlot(unsigned int slot) { + return dictSize(server.db->dict[slot]); +} + +void clusterCommandHelp(client *c) { + const char *help[] = { + "COUNTKEYSINSLOT ", + " Return the number of keys in .", + "GETKEYSINSLOT ", + " Return key names stored by current node in a slot.", + "INFO", + " Return information about the cluster.", + "KEYSLOT ", + " Return the hash slot for .", + "MYID", + " Return the node id.", + "MYSHARDID", + " Return the node's shard id.", + "NODES", + " Return cluster configuration seen by node. Output format:", + " ...", + "REPLICAS ", + " Return replicas.", + "SLOTS", + " Return information about slots range mappings. Each range is made of:", + " start, end, master and replicas IP addresses, ports and ids", + "SHARDS", + " Return information about slot range mappings and the nodes associated with them.", + NULL + }; + + addExtendedReplyHelp(c, help, clusterCommandExtendedHelp()); +} + +void clusterCommand(client *c) { + if (server.cluster_enabled == 0) { + addReplyError(c,"This instance has cluster support disabled"); + return; + } + + if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"help")) { + clusterCommandHelp(c); + } else if (!strcasecmp(c->argv[1]->ptr,"nodes") && c->argc == 2) { + /* CLUSTER NODES */ + /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. 
*/ + sds nodes = clusterGenNodesDescription(c, 0, shouldReturnTlsInfo()); + addReplyVerbatim(c,nodes,sdslen(nodes),"txt"); + sdsfree(nodes); + } else if (!strcasecmp(c->argv[1]->ptr,"myid") && c->argc == 2) { + /* CLUSTER MYID */ + clusterCommandMyId(c); + } else if (!strcasecmp(c->argv[1]->ptr,"myshardid") && c->argc == 2) { + /* CLUSTER MYSHARDID */ + clusterCommandMyShardId(c); + } else if (!strcasecmp(c->argv[1]->ptr,"slots") && c->argc == 2) { + /* CLUSTER SLOTS */ + clusterCommandSlots(c); + } else if (!strcasecmp(c->argv[1]->ptr,"shards") && c->argc == 2) { + /* CLUSTER SHARDS */ + clusterCommandShards(c); + } else if (!strcasecmp(c->argv[1]->ptr,"info") && c->argc == 2) { + /* CLUSTER INFO */ + + sds info = genClusterInfoString(); + + /* Produce the reply protocol. */ + addReplyVerbatim(c,info,sdslen(info),"txt"); + sdsfree(info); + } else if (!strcasecmp(c->argv[1]->ptr,"keyslot") && c->argc == 3) { + /* CLUSTER KEYSLOT */ + sds key = c->argv[2]->ptr; + + addReplyLongLong(c,keyHashSlot(key,sdslen(key))); + } else if (!strcasecmp(c->argv[1]->ptr,"countkeysinslot") && c->argc == 3) { + /* CLUSTER COUNTKEYSINSLOT */ + long long slot; + + if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != C_OK) + return; + if (slot < 0 || slot >= CLUSTER_SLOTS) { + addReplyError(c,"Invalid slot"); + return; + } + addReplyLongLong(c,countKeysInSlot(slot)); + } else if (!strcasecmp(c->argv[1]->ptr,"getkeysinslot") && c->argc == 4) { + /* CLUSTER GETKEYSINSLOT */ + long long maxkeys, slot; + + if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != C_OK) + return; + if (getLongLongFromObjectOrReply(c,c->argv[3],&maxkeys,NULL) + != C_OK) + return; + if (slot < 0 || slot >= CLUSTER_SLOTS || maxkeys < 0) { + addReplyError(c,"Invalid slot or number of keys"); + return; + } + + unsigned int keys_in_slot = countKeysInSlot(slot); + unsigned int numkeys = maxkeys > keys_in_slot ? 
keys_in_slot : maxkeys; + addReplyArrayLen(c,numkeys); + dictIterator *iter = NULL; + dictEntry *de = NULL; + iter = dictGetIterator(server.db->dict[slot]); + for (unsigned int i = 0; i < numkeys; i++) { + de = dictNext(iter); + serverAssert(de != NULL); + sds sdskey = dictGetKey(de); + addReplyBulkCBuffer(c, sdskey, sdslen(sdskey)); + } + dictReleaseIterator(iter); + } else if ((!strcasecmp(c->argv[1]->ptr,"slaves") || + !strcasecmp(c->argv[1]->ptr,"replicas")) && c->argc == 3) { + /* CLUSTER SLAVES */ + /* CLUSTER REPLICAS */ + clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); + int j; + + /* Lookup the specified node in our table. */ + if (!n) { + addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); + return; + } + + if (clusterNodeIsSlave(n)) { + addReplyError(c,"The specified node is not a master"); + return; + } + + /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */ + addReplyArrayLen(c, clusterNodeNumSlaves(n)); + for (j = 0; j < clusterNodeNumSlaves(n); j++) { + sds ni = clusterGenNodeDescription(c, clusterNodeGetSlave(n, j), shouldReturnTlsInfo()); + addReplyBulkCString(c,ni); + sdsfree(ni); + } + } else if(!clusterCommandSpecial(c)) { + addReplySubcommandSyntaxError(c); + return; + } +} + +/* Return the pointer to the cluster node that is able to serve the command. + * For the function to succeed the command should only target either: + * + * 1) A single key (even multiple times like RPOPLPUSH mylist mylist). + * 2) Multiple keys in the same hash slot, while the slot is stable (no + * resharding in progress). + * + * On success the function returns the node that is able to serve the request. + * If the node is not 'myself' a redirection must be performed. The kind of + * redirection is specified setting the integer passed by reference + * 'error_code', which will be set to CLUSTER_REDIR_ASK or + * CLUSTER_REDIR_MOVED. 
+ * + * When the node is 'myself' 'error_code' is set to CLUSTER_REDIR_NONE. + * + * If the command fails NULL is returned, and the reason of the failure is + * provided via 'error_code', which will be set to: + * + * CLUSTER_REDIR_CROSS_SLOT if the request contains multiple keys that + * don't belong to the same hash slot. + * + * CLUSTER_REDIR_UNSTABLE if the request contains multiple keys + * belonging to the same slot, but the slot is not stable (in migration or + * importing state, likely because a resharding is in progress). + * + * CLUSTER_REDIR_DOWN_UNBOUND if the request addresses a slot which is + * not bound to any node. In this case the cluster global state should be + * already "down" but it is fragile to rely on the update of the global state, + * so we also handle it here. + * + * CLUSTER_REDIR_DOWN_STATE and CLUSTER_REDIR_DOWN_RO_STATE if the cluster is + * down but the user attempts to execute a command that addresses one or more keys. */ +clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *error_code) { + clusterNode *myself = getMyClusterNode(); + clusterNode *n = NULL; + robj *firstkey = NULL; + int multiple_keys = 0; + multiState *ms, _ms; + multiCmd mc; + int i, slot = 0, migrating_slot = 0, importing_slot = 0, missing_keys = 0, + existing_keys = 0; + + /* Allow any key to be set if a module disabled cluster redirections. */ + if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION) + return myself; + + /* Set error code optimistically for the base case. */ + if (error_code) *error_code = CLUSTER_REDIR_NONE; + + /* Modules can turn off Redis Cluster redirection: this is useful + * when writing a module that implements a completely different + * distributed system. 
*/ + + /* We handle all the cases as if they were EXEC commands, so we have + * a common code path for everything */ + if (cmd->proc == execCommand) { + /* If CLIENT_MULTI flag is not set EXEC is just going to return an + * error. */ + if (!(c->flags & CLIENT_MULTI)) return myself; + ms = &c->mstate; + } else { + /* In order to have a single codepath create a fake Multi State + * structure if the client is not in MULTI/EXEC state, this way + * we have a single codepath below. */ + ms = &_ms; + _ms.commands = &mc; + _ms.count = 1; + mc.argv = argv; + mc.argc = argc; + mc.cmd = cmd; + } + + int is_pubsubshard = cmd->proc == ssubscribeCommand || + cmd->proc == sunsubscribeCommand || + cmd->proc == spublishCommand; + + /* Check that all the keys are in the same hash slot, and obtain this + * slot and the node associated. */ + for (i = 0; i < ms->count; i++) { + struct redisCommand *mcmd; + robj **margv; + int margc, numkeys, j; + keyReference *keyindex; + + mcmd = ms->commands[i].cmd; + margc = ms->commands[i].argc; + margv = ms->commands[i].argv; + + getKeysResult result = GETKEYS_RESULT_INIT; + numkeys = getKeysFromCommand(mcmd,margv,margc,&result); + keyindex = result.keys; + + for (j = 0; j < numkeys; j++) { + robj *thiskey = margv[keyindex[j].pos]; + int thisslot = keyHashSlot((char*)thiskey->ptr, + sdslen(thiskey->ptr)); + + if (firstkey == NULL) { + /* This is the first key we see. Check what is the slot + * and node. */ + firstkey = thiskey; + slot = thisslot; + n = getNodeBySlot(slot); + + /* Error: If a slot is not served, we are in "cluster down" + * state. However the state is yet to be updated, so this was + * not trapped earlier in processCommand(). Report the same + * error to the client. 
*/ + if (n == NULL) { + getKeysFreeResult(&result); + if (error_code) + *error_code = CLUSTER_REDIR_DOWN_UNBOUND; + return NULL; + } + + /* If we are migrating or importing this slot, we need to check + * if we have all the keys in the request (the only way we + * can safely serve the request, otherwise we return a TRYAGAIN + * error). To do so we set the importing/migrating state and + * increment a counter for every missing key. */ + if (n == myself && + getMigratingSlotDest(slot) != NULL) + { + migrating_slot = 1; + } else if (getImportingSlotSource(slot) != NULL) { + importing_slot = 1; + } + } else { + /* If it is not the first key/channel, make sure it is exactly + * the same key/channel as the first we saw. */ + if (slot != thisslot) { + /* Error: multiple keys from different slots. */ + getKeysFreeResult(&result); + if (error_code) + *error_code = CLUSTER_REDIR_CROSS_SLOT; + return NULL; + } + if (importing_slot && !multiple_keys && !equalStringObjects(firstkey,thiskey)) { + /* Flag this request as one with multiple different + * keys/channels when the slot is in importing state. */ + multiple_keys = 1; + } + } + + /* Migrating / Importing slot? Count keys we don't have. + * If it is pubsubshard command, it isn't required to check + * the channel being present or not in the node during the + * slot migration, the channel will be served from the source + * node until the migration completes with CLUSTER SETSLOT + * NODE . */ + int flags = LOOKUP_NOTOUCH | LOOKUP_NOSTATS | LOOKUP_NONOTIFY | LOOKUP_NOEXPIRE; + if ((migrating_slot || importing_slot) && !is_pubsubshard) + { + if (lookupKeyReadWithFlags(&server.db[0], thiskey, flags) == NULL) missing_keys++; + else existing_keys++; + } + } + getKeysFreeResult(&result); + } + + /* No key at all in command? then we can serve the request + * without redirections or errors in all the cases. */ + if (n == NULL) return myself; + + uint64_t cmd_flags = getCommandFlags(c); + /* Cluster is globally down but we got keys? 
We only serve the request + * if it is a read command and when allow_reads_when_down is enabled. */ + if (!isClusterHealthy()) { + if (is_pubsubshard) { + if (!server.cluster_allow_pubsubshard_when_down) { + if (error_code) *error_code = CLUSTER_REDIR_DOWN_STATE; + return NULL; + } + } else if (!server.cluster_allow_reads_when_down) { + /* The cluster is configured to block commands when the + * cluster is down. */ + if (error_code) *error_code = CLUSTER_REDIR_DOWN_STATE; + return NULL; + } else if (cmd_flags & CMD_WRITE) { + /* The cluster is configured to allow read only commands */ + if (error_code) *error_code = CLUSTER_REDIR_DOWN_RO_STATE; + return NULL; + } else { + /* Fall through and allow the command to be executed: + * this happens when server.cluster_allow_reads_when_down is + * true and the command is not a write command */ + } + } + + /* Return the hashslot by reference. */ + if (hashslot) *hashslot = slot; + + /* MIGRATE always works in the context of the local node if the slot + * is open (migrating or importing state). We need to be able to freely + * move keys among instances in this case. */ + if ((migrating_slot || importing_slot) && cmd->proc == migrateCommand) + return myself; + + /* If we don't have all the keys and we are migrating the slot, send + * an ASK redirection or TRYAGAIN. */ + if (migrating_slot && missing_keys) { + /* If we have keys but we don't have all keys, we return TRYAGAIN */ + if (existing_keys) { + if (error_code) *error_code = CLUSTER_REDIR_UNSTABLE; + return NULL; + } else { + if (error_code) *error_code = CLUSTER_REDIR_ASK; + return getMigratingSlotDest(slot); + } + } + + /* If we are receiving the slot, and the client correctly flagged the + * request as "ASKING", we can serve the request. However if the request + * involves multiple keys and we don't have them all, the only option is + * to send a TRYAGAIN error. 
*/ + if (importing_slot && + (c->flags & CLIENT_ASKING || cmd_flags & CMD_ASKING)) + { + if (multiple_keys && missing_keys) { + if (error_code) *error_code = CLUSTER_REDIR_UNSTABLE; + return NULL; + } else { + return myself; + } + } + + /* Handle the read-only client case reading from a slave: if this + * node is a slave and the request is about a hash slot our master + * is serving, we can reply without redirection. */ + int is_write_command = (cmd_flags & CMD_WRITE) || + (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_WRITE)); + if (((c->flags & CLIENT_READONLY) || is_pubsubshard) && + !is_write_command && + clusterNodeIsSlave(myself) && + clusterNodeGetSlaveof(myself) == n) + { + return myself; + } + + /* Base case: just return the right node. However, if this node is not + * myself, set error_code to MOVED since we need to issue a redirection. */ + if (n != myself && error_code) *error_code = CLUSTER_REDIR_MOVED; + return n; +} + +/* Send the client the right redirection code, according to error_code + * that should be set to one of CLUSTER_REDIR_* macros. + * + * If CLUSTER_REDIR_ASK or CLUSTER_REDIR_MOVED error codes + * are used, then the node 'n' should not be NULL, but should be the + * node we want to mention in the redirection. Moreover hashslot should + * be set to the hash slot that caused the redirection. */ +void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code) { + if (error_code == CLUSTER_REDIR_CROSS_SLOT) { + addReplyError(c,"-CROSSSLOT Keys in request don't hash to the same slot"); + } else if (error_code == CLUSTER_REDIR_UNSTABLE) { + /* The request spawns multiple keys in the same slot, + * but the slot is not "stable" currently as there is + * a migration or import in progress. 
*/ + addReplyError(c,"-TRYAGAIN Multiple keys request during rehashing of slot"); + } else if (error_code == CLUSTER_REDIR_DOWN_STATE) { + addReplyError(c,"-CLUSTERDOWN The cluster is down"); + } else if (error_code == CLUSTER_REDIR_DOWN_RO_STATE) { + addReplyError(c,"-CLUSTERDOWN The cluster is down and only accepts read commands"); + } else if (error_code == CLUSTER_REDIR_DOWN_UNBOUND) { + addReplyError(c,"-CLUSTERDOWN Hash slot not served"); + } else if (error_code == CLUSTER_REDIR_MOVED || + error_code == CLUSTER_REDIR_ASK) + { + /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */ + int port = clusterNodeClientPort(n, shouldReturnTlsInfo()); + addReplyErrorSds(c,sdscatprintf(sdsempty(), + "-%s %d %s:%d", + (error_code == CLUSTER_REDIR_ASK) ? "ASK" : "MOVED", + hashslot, clusterNodePreferredEndpoint(n), port)); + } else { + serverPanic("getNodeByQuery() unknown error."); + } +} + +/* This function is called by the function processing clients incrementally + * to detect timeouts, in order to handle the following case: + * + * 1) A client blocks with BLPOP or similar blocking operation. + * 2) The master migrates the hash slot elsewhere or turns into a slave. + * 3) The client may remain blocked forever (or up to the max timeout time) + * waiting for a key change that will never happen. + * + * If the client is found to be blocked into a hash slot this node no + * longer handles, the client is sent a redirection error, and the function + * returns 1. Otherwise 0 is returned and no operation is performed. */ +int clusterRedirectBlockedClientIfNeeded(client *c) { + clusterNode *myself = getMyClusterNode(); + if (c->flags & CLIENT_BLOCKED && + (c->bstate.btype == BLOCKED_LIST || + c->bstate.btype == BLOCKED_ZSET || + c->bstate.btype == BLOCKED_STREAM || + c->bstate.btype == BLOCKED_MODULE)) + { + dictEntry *de; + dictIterator *di; + + /* If the cluster is down, unblock the client with the right error. 
+ * If the cluster is configured to allow reads on cluster down, we + * still want to emit this error since a write will be required + * to unblock them which may never come. */ + if (!isClusterHealthy()) { + clusterRedirectClient(c,NULL,0,CLUSTER_REDIR_DOWN_STATE); + return 1; + } + + /* If the client is blocked on module, but not on a specific key, + * don't unblock it (except for the CLUSTER_FAIL case above). */ + if (c->bstate.btype == BLOCKED_MODULE && !moduleClientIsBlockedOnKeys(c)) + return 0; + + /* All keys must belong to the same slot, so check first key only. */ + di = dictGetIterator(c->bstate.keys); + if ((de = dictNext(di)) != NULL) { + robj *key = dictGetKey(de); + int slot = keyHashSlot((char*)key->ptr, sdslen(key->ptr)); + clusterNode *node = getNodeBySlot(slot); + + /* if the client is read-only and attempting to access key that our + * replica can handle, allow it. */ + if ((c->flags & CLIENT_READONLY) && + !(c->lastcmd->flags & CMD_WRITE) && + clusterNodeIsSlave(myself) && clusterNodeGetSlaveof(myself) == node) + { + node = myself; + } + + /* We send an error and unblock the client if: + * 1) The slot is unassigned, emitting a cluster down error. + * 2) The slot is not handled by this node, nor being imported. */ + if (node != myself && getImportingSlotSource(slot) == NULL) + { + if (node == NULL) { + clusterRedirectClient(c,NULL,0, + CLUSTER_REDIR_DOWN_UNBOUND); + } else { + clusterRedirectClient(c,node,slot, + CLUSTER_REDIR_MOVED); + } + dictReleaseIterator(di); + return 1; + } + } + dictReleaseIterator(di); + } + return 0; +} + +/* Returns an indication if the replica node is fully available + * and should be listed in CLUSTER SLOTS response. + * Returns 1 for available nodes, 0 for nodes that have + * not finished their initial sync, in failed state, or are + * otherwise considered not available to serve read commands. 
*/ +static int isReplicaAvailable(clusterNode *node) { + if (clusterNodeIsFailing(node)) { + return 0; + } + long long repl_offset = clusterNodeReplOffset(node); + if (clusterNodeIsMyself(node)) { + /* Nodes do not update their own information + * in the cluster node list. */ + repl_offset = replicationGetSlaveOffset(); + } + return (repl_offset != 0); +} + +void addNodeToNodeReply(client *c, clusterNode *node) { + char* hostname = clusterNodeHostname(node); + addReplyArrayLen(c, 4); + if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_IP) { + addReplyBulkCString(c, clusterNodeIp(node)); + } else if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_HOSTNAME) { + if (hostname != NULL && hostname[0] != '\0') { + addReplyBulkCString(c, hostname); + } else { + addReplyBulkCString(c, "?"); + } + } else if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_UNKNOWN_ENDPOINT) { + addReplyNull(c); + } else { + serverPanic("Unrecognized preferred endpoint type"); + } + + /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */ + addReplyLongLong(c, clusterNodeClientPort(node, shouldReturnTlsInfo())); + addReplyBulkCBuffer(c, clusterNodeGetName(node), CLUSTER_NAMELEN); + + /* Add the additional endpoint information, this is all the known networking information + * that is not the preferred endpoint. Note the logic is evaluated twice so we can + * correctly report the number of additional network arguments without using a deferred + * map, an assertion is made at the end to check we set the right length. 
*/ + int length = 0; + if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_IP) { + length++; + } + if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_HOSTNAME + && hostname != NULL && hostname[0] != '\0') + { + length++; + } + addReplyMapLen(c, length); + + if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_IP) { + addReplyBulkCString(c, "ip"); + addReplyBulkCString(c, clusterNodeIp(node)); + length--; + } + if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_HOSTNAME + && hostname != NULL && hostname[0] != '\0') + { + addReplyBulkCString(c, "hostname"); + addReplyBulkCString(c, hostname); + length--; + } + serverAssert(length == 0); +} + +void addNodeReplyForClusterSlot(client *c, clusterNode *node, int start_slot, int end_slot) { + int i, nested_elements = 3; /* slots (2) + master addr (1) */ + for (i = 0; i < clusterNodeNumSlaves(node); i++) { + if (!isReplicaAvailable(clusterNodeGetSlave(node, i))) continue; + nested_elements++; + } + addReplyArrayLen(c, nested_elements); + addReplyLongLong(c, start_slot); + addReplyLongLong(c, end_slot); + addNodeToNodeReply(c, node); + + /* Remaining nodes in reply are replicas for slot range */ + for (i = 0; i < clusterNodeNumSlaves(node); i++) { + /* This loop is copy/pasted from clusterGenNodeDescription() + * with modifications for per-slot node aggregation. */ + if (!isReplicaAvailable(clusterNodeGetSlave(node, i))) continue; + addNodeToNodeReply(c, clusterNodeGetSlave(node, i)); + nested_elements--; + } + serverAssert(nested_elements == 3); /* Original 3 elements */ +} + +void clusterCommandSlots(client * c) { + /* Format: 1) 1) start slot + * 2) end slot + * 3) 1) master IP + * 2) master port + * 3) node ID + * 4) 1) replica IP + * 2) replica port + * 3) node ID + * ... 
continued until done + */ + clusterNode *n = NULL; + int num_masters = 0, start = -1; + void *slot_replylen = addReplyDeferredLen(c); + + for (int i = 0; i <= CLUSTER_SLOTS; i++) { + /* Find start node and slot id. */ + if (n == NULL) { + if (i == CLUSTER_SLOTS) break; + n = getNodeBySlot(i); + start = i; + continue; + } + + /* Add cluster slots info when occur different node with start + * or end of slot. */ + if (i == CLUSTER_SLOTS || n != getNodeBySlot(i)) { + addNodeReplyForClusterSlot(c, n, start, i-1); + num_masters++; + if (i == CLUSTER_SLOTS) break; + n = getNodeBySlot(i); + start = i; + } + } + setDeferredArrayLen(c, slot_replylen, num_masters); +} + /* ----------------------------------------------------------------------------- * Cluster functions related to serving / redirecting clients * -------------------------------------------------------------------------- */ @@ -7223,435 +1442,3 @@ void readwriteCommand(client *c) { c->flags &= ~CLIENT_READONLY; addReply(c,shared.ok); } - -/* Return the pointer to the cluster node that is able to serve the command. - * For the function to succeed the command should only target either: - * - * 1) A single key (even multiple times like RPOPLPUSH mylist mylist). - * 2) Multiple keys in the same hash slot, while the slot is stable (no - * resharding in progress). - * - * On success the function returns the node that is able to serve the request. - * If the node is not 'myself' a redirection must be performed. The kind of - * redirection is specified setting the integer passed by reference - * 'error_code', which will be set to CLUSTER_REDIR_ASK or - * CLUSTER_REDIR_MOVED. - * - * When the node is 'myself' 'error_code' is set to CLUSTER_REDIR_NONE. - * - * If the command fails NULL is returned, and the reason of the failure is - * provided via 'error_code', which will be set to: - * - * CLUSTER_REDIR_CROSS_SLOT if the request contains multiple keys that - * don't belong to the same hash slot. 
- * - * CLUSTER_REDIR_UNSTABLE if the request contains multiple keys - * belonging to the same slot, but the slot is not stable (in migration or - * importing state, likely because a resharding is in progress). - * - * CLUSTER_REDIR_DOWN_UNBOUND if the request addresses a slot which is - * not bound to any node. In this case the cluster global state should be - * already "down" but it is fragile to rely on the update of the global state, - * so we also handle it here. - * - * CLUSTER_REDIR_DOWN_STATE and CLUSTER_REDIR_DOWN_RO_STATE if the cluster is - * down but the user attempts to execute a command that addresses one or more keys. */ -clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *error_code) { - clusterNode *n = NULL; - robj *firstkey = NULL; - int multiple_keys = 0; - multiState *ms, _ms; - multiCmd mc; - int i, slot = 0, migrating_slot = 0, importing_slot = 0, missing_keys = 0, - existing_keys = 0; - - /* Allow any key to be set if a module disabled cluster redirections. */ - if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION) - return myself; - - /* Set error code optimistically for the base case. */ - if (error_code) *error_code = CLUSTER_REDIR_NONE; - - /* Modules can turn off Redis Cluster redirection: this is useful - * when writing a module that implements a completely different - * distributed system. */ - - /* We handle all the cases as if they were EXEC commands, so we have - * a common code path for everything */ - if (cmd->proc == execCommand) { - /* If CLIENT_MULTI flag is not set EXEC is just going to return an - * error. */ - if (!(c->flags & CLIENT_MULTI)) return myself; - ms = &c->mstate; - } else { - /* In order to have a single codepath create a fake Multi State - * structure if the client is not in MULTI/EXEC state, this way - * we have a single codepath below. 
*/ - ms = &_ms; - _ms.commands = &mc; - _ms.count = 1; - mc.argv = argv; - mc.argc = argc; - mc.cmd = cmd; - } - - int is_pubsubshard = cmd->proc == ssubscribeCommand || - cmd->proc == sunsubscribeCommand || - cmd->proc == spublishCommand; - - /* Check that all the keys are in the same hash slot, and obtain this - * slot and the node associated. */ - for (i = 0; i < ms->count; i++) { - struct redisCommand *mcmd; - robj **margv; - int margc, numkeys, j; - keyReference *keyindex; - - mcmd = ms->commands[i].cmd; - margc = ms->commands[i].argc; - margv = ms->commands[i].argv; - - getKeysResult result = GETKEYS_RESULT_INIT; - numkeys = getKeysFromCommand(mcmd,margv,margc,&result); - keyindex = result.keys; - - for (j = 0; j < numkeys; j++) { - robj *thiskey = margv[keyindex[j].pos]; - int thisslot = keyHashSlot((char*)thiskey->ptr, - sdslen(thiskey->ptr)); - - if (firstkey == NULL) { - /* This is the first key we see. Check what is the slot - * and node. */ - firstkey = thiskey; - slot = thisslot; - n = server.cluster->slots[slot]; - - /* Error: If a slot is not served, we are in "cluster down" - * state. However the state is yet to be updated, so this was - * not trapped earlier in processCommand(). Report the same - * error to the client. */ - if (n == NULL) { - getKeysFreeResult(&result); - if (error_code) - *error_code = CLUSTER_REDIR_DOWN_UNBOUND; - return NULL; - } - - /* If we are migrating or importing this slot, we need to check - * if we have all the keys in the request (the only way we - * can safely serve the request, otherwise we return a TRYAGAIN - * error). To do so we set the importing/migrating state and - * increment a counter for every missing key. 
*/ - if (n == myself && - server.cluster->migrating_slots_to[slot] != NULL) - { - migrating_slot = 1; - } else if (server.cluster->importing_slots_from[slot] != NULL) { - importing_slot = 1; - } - } else { - /* If it is not the first key/channel, make sure it is exactly - * the same key/channel as the first we saw. */ - if (slot != thisslot) { - /* Error: multiple keys from different slots. */ - getKeysFreeResult(&result); - if (error_code) - *error_code = CLUSTER_REDIR_CROSS_SLOT; - return NULL; - } - if (importing_slot && !multiple_keys && !equalStringObjects(firstkey,thiskey)) { - /* Flag this request as one with multiple different - * keys/channels when the slot is in importing state. */ - multiple_keys = 1; - } - } - - /* Migrating / Importing slot? Count keys we don't have. - * If it is pubsubshard command, it isn't required to check - * the channel being present or not in the node during the - * slot migration, the channel will be served from the source - * node until the migration completes with CLUSTER SETSLOT - * NODE . */ - int flags = LOOKUP_NOTOUCH | LOOKUP_NOSTATS | LOOKUP_NONOTIFY | LOOKUP_NOEXPIRE; - if ((migrating_slot || importing_slot) && !is_pubsubshard) - { - if (lookupKeyReadWithFlags(&server.db[0], thiskey, flags) == NULL) missing_keys++; - else existing_keys++; - } - } - getKeysFreeResult(&result); - } - - /* No key at all in command? then we can serve the request - * without redirections or errors in all the cases. */ - if (n == NULL) return myself; - - uint64_t cmd_flags = getCommandFlags(c); - /* Cluster is globally down but we got keys? We only serve the request - * if it is a read command and when allow_reads_when_down is enabled. 
*/ - if (server.cluster->state != CLUSTER_OK) { - if (is_pubsubshard) { - if (!server.cluster_allow_pubsubshard_when_down) { - if (error_code) *error_code = CLUSTER_REDIR_DOWN_STATE; - return NULL; - } - } else if (!server.cluster_allow_reads_when_down) { - /* The cluster is configured to block commands when the - * cluster is down. */ - if (error_code) *error_code = CLUSTER_REDIR_DOWN_STATE; - return NULL; - } else if (cmd_flags & CMD_WRITE) { - /* The cluster is configured to allow read only commands */ - if (error_code) *error_code = CLUSTER_REDIR_DOWN_RO_STATE; - return NULL; - } else { - /* Fall through and allow the command to be executed: - * this happens when server.cluster_allow_reads_when_down is - * true and the command is not a write command */ - } - } - - /* Return the hashslot by reference. */ - if (hashslot) *hashslot = slot; - - /* MIGRATE always works in the context of the local node if the slot - * is open (migrating or importing state). We need to be able to freely - * move keys among instances in this case. */ - if ((migrating_slot || importing_slot) && cmd->proc == migrateCommand) - return myself; - - /* If we don't have all the keys and we are migrating the slot, send - * an ASK redirection or TRYAGAIN. */ - if (migrating_slot && missing_keys) { - /* If we have keys but we don't have all keys, we return TRYAGAIN */ - if (existing_keys) { - if (error_code) *error_code = CLUSTER_REDIR_UNSTABLE; - return NULL; - } else { - if (error_code) *error_code = CLUSTER_REDIR_ASK; - return server.cluster->migrating_slots_to[slot]; - } - } - - /* If we are receiving the slot, and the client correctly flagged the - * request as "ASKING", we can serve the request. However if the request - * involves multiple keys and we don't have them all, the only option is - * to send a TRYAGAIN error. 
*/ - if (importing_slot && - (c->flags & CLIENT_ASKING || cmd_flags & CMD_ASKING)) - { - if (multiple_keys && missing_keys) { - if (error_code) *error_code = CLUSTER_REDIR_UNSTABLE; - return NULL; - } else { - return myself; - } - } - - /* Handle the read-only client case reading from a slave: if this - * node is a slave and the request is about a hash slot our master - * is serving, we can reply without redirection. */ - int is_write_command = (cmd_flags & CMD_WRITE) || - (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_WRITE)); - if (((c->flags & CLIENT_READONLY) || is_pubsubshard) && - !is_write_command && - nodeIsSlave(myself) && - myself->slaveof == n) - { - return myself; - } - - /* Base case: just return the right node. However if this node is not - * myself, set error_code to MOVED since we need to issue a redirection. */ - if (n != myself && error_code) *error_code = CLUSTER_REDIR_MOVED; - return n; -} - -/* Send the client the right redirection code, according to error_code - * that should be set to one of CLUSTER_REDIR_* macros. - * - * If CLUSTER_REDIR_ASK or CLUSTER_REDIR_MOVED error codes - * are used, then the node 'n' should not be NULL, but should be the - * node we want to mention in the redirection. Moreover hashslot should - * be set to the hash slot that caused the redirection. */ -void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code) { - if (error_code == CLUSTER_REDIR_CROSS_SLOT) { - addReplyError(c,"-CROSSSLOT Keys in request don't hash to the same slot"); - } else if (error_code == CLUSTER_REDIR_UNSTABLE) { - /* The request spawns multiple keys in the same slot, - * but the slot is not "stable" currently as there is - * a migration or import in progress. 
*/ - addReplyError(c,"-TRYAGAIN Multiple keys request during rehashing of slot"); - } else if (error_code == CLUSTER_REDIR_DOWN_STATE) { - addReplyError(c,"-CLUSTERDOWN The cluster is down"); - } else if (error_code == CLUSTER_REDIR_DOWN_RO_STATE) { - addReplyError(c,"-CLUSTERDOWN The cluster is down and only accepts read commands"); - } else if (error_code == CLUSTER_REDIR_DOWN_UNBOUND) { - addReplyError(c,"-CLUSTERDOWN Hash slot not served"); - } else if (error_code == CLUSTER_REDIR_MOVED || - error_code == CLUSTER_REDIR_ASK) - { - /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */ - int port = getNodeClientPort(n, shouldReturnTlsInfo()); - addReplyErrorSds(c,sdscatprintf(sdsempty(), - "-%s %d %s:%d", - (error_code == CLUSTER_REDIR_ASK) ? "ASK" : "MOVED", - hashslot, getPreferredEndpoint(n), port)); - } else { - serverPanic("getNodeByQuery() unknown error."); - } -} - -/* This function is called by the function processing clients incrementally - * to detect timeouts, in order to handle the following case: - * - * 1) A client blocks with BLPOP or similar blocking operation. - * 2) The master migrates the hash slot elsewhere or turns into a slave. - * 3) The client may remain blocked forever (or up to the max timeout time) - * waiting for a key change that will never happen. - * - * If the client is found to be blocked into a hash slot this node no - * longer handles, the client is sent a redirection error, and the function - * returns 1. Otherwise 0 is returned and no operation is performed. */ -int clusterRedirectBlockedClientIfNeeded(client *c) { - if (c->flags & CLIENT_BLOCKED && - (c->bstate.btype == BLOCKED_LIST || - c->bstate.btype == BLOCKED_ZSET || - c->bstate.btype == BLOCKED_STREAM || - c->bstate.btype == BLOCKED_MODULE)) - { - dictEntry *de; - dictIterator *di; - - /* If the cluster is down, unblock the client with the right error. 
- * If the cluster is configured to allow reads on cluster down, we - * still want to emit this error since a write will be required - * to unblock them which may never come. */ - if (server.cluster->state == CLUSTER_FAIL) { - clusterRedirectClient(c,NULL,0,CLUSTER_REDIR_DOWN_STATE); - return 1; - } - - /* If the client is blocked on module, but not on a specific key, - * don't unblock it (except for the CLUSTER_FAIL case above). */ - if (c->bstate.btype == BLOCKED_MODULE && !moduleClientIsBlockedOnKeys(c)) - return 0; - - /* All keys must belong to the same slot, so check first key only. */ - di = dictGetIterator(c->bstate.keys); - if ((de = dictNext(di)) != NULL) { - robj *key = dictGetKey(de); - int slot = keyHashSlot((char*)key->ptr, sdslen(key->ptr)); - clusterNode *node = server.cluster->slots[slot]; - - /* if the client is read-only and attempting to access key that our - * replica can handle, allow it. */ - if ((c->flags & CLIENT_READONLY) && - !(c->lastcmd->flags & CMD_WRITE) && - nodeIsSlave(myself) && myself->slaveof == node) - { - node = myself; - } - - /* We send an error and unblock the client if: - * 1) The slot is unassigned, emitting a cluster down error. - * 2) The slot is not handled by this node, nor being imported. */ - if (node != myself && - server.cluster->importing_slots_from[slot] == NULL) - { - if (node == NULL) { - clusterRedirectClient(c,NULL,0, - CLUSTER_REDIR_DOWN_UNBOUND); - } else { - clusterRedirectClient(c,node,slot, - CLUSTER_REDIR_MOVED); - } - dictReleaseIterator(di); - return 1; - } - } - dictReleaseIterator(di); - } - return 0; -} - -/* Remove all the keys in the specified hash slot. - * The number of removed items is returned. 
*/ -unsigned int delKeysInSlot(unsigned int hashslot) { - unsigned int j = 0; - - dictIterator *iter = NULL; - dictEntry *de = NULL; - iter = dictGetSafeIterator(server.db->dict[hashslot]); - while((de = dictNext(iter)) != NULL) { - sds sdskey = dictGetKey(de); - robj *key = createStringObject(sdskey, sdslen(sdskey)); - dbDelete(&server.db[0], key); - propagateDeletion(&server.db[0], key, server.lazyfree_lazy_server_del); - signalModifiedKey(NULL, &server.db[0], key); - /* The keys are not actually logically deleted from the database, just moved to another node. - * The modules needs to know that these keys are no longer available locally, so just send the - * keyspace notification to the modules, but not to clients. */ - moduleNotifyKeyspaceEvent(NOTIFY_GENERIC, "del", key, server.db[0].id); - postExecutionUnitOperations(); - decrRefCount(key); - j++; - server.dirty++; - } - dictReleaseIterator(iter); - - return j; -} - -unsigned int countKeysInSlot(unsigned int slot) { - return dictSize(server.db->dict[slot]); -} - -/* ----------------------------------------------------------------------------- - * Operation(s) on channel rax tree. 
- * -------------------------------------------------------------------------- */ - -void slotToChannelUpdate(sds channel, int add) { - size_t keylen = sdslen(channel); - unsigned int hashslot = keyHashSlot(channel,keylen); - unsigned char buf[64]; - unsigned char *indexed = buf; - - if (keylen+2 > 64) indexed = zmalloc(keylen+2); - indexed[0] = (hashslot >> 8) & 0xff; - indexed[1] = hashslot & 0xff; - memcpy(indexed+2,channel,keylen); - if (add) { - raxInsert(server.cluster->slots_to_channels,indexed,keylen+2,NULL,NULL); - } else { - raxRemove(server.cluster->slots_to_channels,indexed,keylen+2,NULL); - } - if (indexed != buf) zfree(indexed); -} - -void slotToChannelAdd(sds channel) { - slotToChannelUpdate(channel,1); -} - -void slotToChannelDel(sds channel) { - slotToChannelUpdate(channel,0); -} - -/* Get the count of the channels for a given slot. */ -unsigned int countChannelsInSlot(unsigned int hashslot) { - raxIterator iter; - int j = 0; - unsigned char indexed[2]; - - indexed[0] = (hashslot >> 8) & 0xff; - indexed[1] = hashslot & 0xff; - raxStart(&iter,server.cluster->slots_to_channels); - raxSeek(&iter,">=",indexed,2); - while(raxNext(&iter)) { - if (iter.key[0] != indexed[0] || iter.key[1] != indexed[1]) break; - j++; - } - raxStop(&iter); - return j; -} diff --git a/src/cluster.h b/src/cluster.h index 0340349b2..02c5f67f3 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -2,7 +2,7 @@ #define __CLUSTER_H /*----------------------------------------------------------------------------- - * Redis cluster data structures, defines, exported API. + * Redis cluster exported API. *----------------------------------------------------------------------------*/ #define CLUSTER_SLOT_MASK_BITS 14 /* Number of bits used for slot id. 
*/ @@ -11,15 +11,6 @@ #define CLUSTER_OK 0 /* Everything looks ok */ #define CLUSTER_FAIL 1 /* The cluster can't work */ #define CLUSTER_NAMELEN 40 /* sha1 hex length */ -#define CLUSTER_PORT_INCR 10000 /* Cluster port = baseport + PORT_INCR */ - -/* The following defines are amount of time, sometimes expressed as - * multiplicators of the node timeout value (when ending with MULT). */ -#define CLUSTER_FAIL_REPORT_VALIDITY_MULT 2 /* Fail report validity. */ -#define CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */ -#define CLUSTER_MF_TIMEOUT 5000 /* Milliseconds to do a manual failover. */ -#define CLUSTER_MF_PAUSE_MULT 2 /* Master pause manual failover mult. */ -#define CLUSTER_SLAVE_MIGRATION_DELAY 5000 /* Delay for slave migration. */ /* Redirection errors returned by getNodeByQuery(). */ #define CLUSTER_REDIR_NONE 0 /* Node can serve the request. */ @@ -31,77 +22,8 @@ #define CLUSTER_REDIR_DOWN_UNBOUND 6 /* -CLUSTERDOWN, unbound slot. */ #define CLUSTER_REDIR_DOWN_RO_STATE 7 /* -CLUSTERDOWN, allow reads. */ -struct clusterNode; - -/* clusterLink encapsulates everything needed to talk with a remote node. */ -typedef struct clusterLink { - mstime_t ctime; /* Link creation time */ - connection *conn; /* Connection to remote node */ - list *send_msg_queue; /* List of messages to be sent */ - size_t head_msg_send_offset; /* Number of bytes already sent of message at head of queue */ - unsigned long long send_msg_queue_mem; /* Memory in bytes used by message queue */ - char *rcvbuf; /* Packet reception buffer */ - size_t rcvbuf_len; /* Used size of rcvbuf */ - size_t rcvbuf_alloc; /* Allocated size of rcvbuf */ - struct clusterNode *node; /* Node related to this link. Initialized to NULL when unknown */ - int inbound; /* 1 if this link is an inbound link accepted from the related node */ -} clusterLink; - -/* Cluster node flags and macros. 
*/ -#define CLUSTER_NODE_MASTER 1 /* The node is a master */ -#define CLUSTER_NODE_SLAVE 2 /* The node is a slave */ -#define CLUSTER_NODE_PFAIL 4 /* Failure? Need acknowledge */ -#define CLUSTER_NODE_FAIL 8 /* The node is believed to be malfunctioning */ -#define CLUSTER_NODE_MYSELF 16 /* This node is myself */ -#define CLUSTER_NODE_HANDSHAKE 32 /* We have still to exchange the first ping */ -#define CLUSTER_NODE_NOADDR 64 /* We don't know the address of this node */ -#define CLUSTER_NODE_MEET 128 /* Send a MEET message to this node */ -#define CLUSTER_NODE_MIGRATE_TO 256 /* Master eligible for replica migration. */ -#define CLUSTER_NODE_NOFAILOVER 512 /* Slave will not try to failover. */ -#define CLUSTER_NODE_NULL_NAME "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" - -#define nodeIsMaster(n) ((n)->flags & CLUSTER_NODE_MASTER) -#define nodeIsSlave(n) ((n)->flags & CLUSTER_NODE_SLAVE) -#define nodeInHandshake(n) ((n)->flags & CLUSTER_NODE_HANDSHAKE) -#define nodeHasAddr(n) (!((n)->flags & CLUSTER_NODE_NOADDR)) -#define nodeWithoutAddr(n) ((n)->flags & CLUSTER_NODE_NOADDR) -#define nodeTimedOut(n) ((n)->flags & CLUSTER_NODE_PFAIL) -#define nodeFailed(n) ((n)->flags & CLUSTER_NODE_FAIL) -#define nodeCantFailover(n) ((n)->flags & CLUSTER_NODE_NOFAILOVER) - -/* Reasons why a slave is not able to failover. */ -#define CLUSTER_CANT_FAILOVER_NONE 0 -#define CLUSTER_CANT_FAILOVER_DATA_AGE 1 -#define CLUSTER_CANT_FAILOVER_WAITING_DELAY 2 -#define CLUSTER_CANT_FAILOVER_EXPIRED 3 -#define CLUSTER_CANT_FAILOVER_WAITING_VOTES 4 -#define CLUSTER_CANT_FAILOVER_RELOG_PERIOD (10) /* seconds. */ - -/* clusterState todo_before_sleep flags. 
*/ -#define CLUSTER_TODO_HANDLE_FAILOVER (1<<0) -#define CLUSTER_TODO_UPDATE_STATE (1<<1) -#define CLUSTER_TODO_SAVE_CONFIG (1<<2) -#define CLUSTER_TODO_FSYNC_CONFIG (1<<3) -#define CLUSTER_TODO_HANDLE_MANUALFAILOVER (1<<4) - -/* Message types. - * - * Note that the PING, PONG and MEET messages are actually the same exact - * kind of packet. PONG is the reply to ping, in the exact format as a PING, - * while MEET is a special PING that forces the receiver to add the sender - * as a node (if it is not already in the list). */ -#define CLUSTERMSG_TYPE_PING 0 /* Ping */ -#define CLUSTERMSG_TYPE_PONG 1 /* Pong (reply to Ping) */ -#define CLUSTERMSG_TYPE_MEET 2 /* Meet "let's join" message */ -#define CLUSTERMSG_TYPE_FAIL 3 /* Mark node xxx as failing */ -#define CLUSTERMSG_TYPE_PUBLISH 4 /* Pub/Sub Publish propagation */ -#define CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST 5 /* May I failover? */ -#define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6 /* Yes, you have my vote */ -#define CLUSTERMSG_TYPE_UPDATE 7 /* Another node slots configuration */ -#define CLUSTERMSG_TYPE_MFSTART 8 /* Pause clients for manual failover */ -#define CLUSTERMSG_TYPE_MODULE 9 /* Module cluster API message. */ -#define CLUSTERMSG_TYPE_PUBLISHSHARD 10 /* Pub/Sub Publish shard propagation */ -#define CLUSTERMSG_TYPE_COUNT 11 /* Total number of message types. */ +typedef struct _clusterNode clusterNode; +struct clusterState; /* Flags that a module can set in order to prevent certain Redis Cluster * features to be enabled. Useful when implementing a different distributed @@ -110,312 +32,87 @@ typedef struct clusterLink { #define CLUSTER_MODULE_FLAG_NO_FAILOVER (1<<1) #define CLUSTER_MODULE_FLAG_NO_REDIRECTION (1<<2) -/* This structure represent elements of node->fail_reports. */ -typedef struct clusterNodeFailReport { - struct clusterNode *node; /* Node reporting the failure condition. */ - mstime_t time; /* Time of the last report from this node. 
*/ -} clusterNodeFailReport; - -typedef struct clusterNode { - mstime_t ctime; /* Node object creation time. */ - char name[CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size */ - char shard_id[CLUSTER_NAMELEN]; /* shard id, hex string, sha1-size */ - int flags; /* CLUSTER_NODE_... */ - uint64_t configEpoch; /* Last configEpoch observed for this node */ - unsigned char slots[CLUSTER_SLOTS/8]; /* slots handled by this node */ - uint16_t *slot_info_pairs; /* Slots info represented as (start/end) pair (consecutive index). */ - int slot_info_pairs_count; /* Used number of slots in slot_info_pairs */ - int numslots; /* Number of slots handled by this node */ - int numslaves; /* Number of slave nodes, if this is a master */ - struct clusterNode **slaves; /* pointers to slave nodes */ - struct clusterNode *slaveof; /* pointer to the master node. Note that it - may be NULL even if the node is a slave - if we don't have the master node in our - tables. */ - unsigned long long last_in_ping_gossip; /* The number of the last carried in the ping gossip section */ - mstime_t ping_sent; /* Unix time we sent latest ping */ - mstime_t pong_received; /* Unix time we received the pong */ - mstime_t data_received; /* Unix time we received any data */ - mstime_t fail_time; /* Unix time when FAIL flag was set */ - mstime_t voted_time; /* Last time we voted for a slave of this master */ - mstime_t repl_offset_time; /* Unix time we received offset for this node */ - mstime_t orphaned_time; /* Starting time of orphaned master condition */ - long long repl_offset; /* Last known repl offset for this node. */ - char ip[NET_IP_STR_LEN]; /* Latest known IP address of this node */ - sds hostname; /* The known hostname for this node */ - sds human_nodename; /* The known human readable nodename for this node */ - int tcp_port; /* Latest known clients TCP port. */ - int tls_port; /* Latest known clients TLS port */ - int cport; /* Latest known cluster port of this node. 
*/ - clusterLink *link; /* TCP/IP link established toward this node */ - clusterLink *inbound_link; /* TCP/IP link accepted from this node */ - list *fail_reports; /* List of nodes signaling this as failing */ -} clusterNode; - -typedef struct clusterState { - clusterNode *myself; /* This node */ - uint64_t currentEpoch; - int state; /* CLUSTER_OK, CLUSTER_FAIL, ... */ - int size; /* Num of master nodes with at least one slot */ - dict *nodes; /* Hash table of name -> clusterNode structures */ - dict *shards; /* Hash table of shard_id -> list (of nodes) structures */ - dict *nodes_black_list; /* Nodes we don't re-add for a few seconds. */ - clusterNode *migrating_slots_to[CLUSTER_SLOTS]; - clusterNode *importing_slots_from[CLUSTER_SLOTS]; - clusterNode *slots[CLUSTER_SLOTS]; - rax *slots_to_channels; - /* The following fields are used to take the slave state on elections. */ - mstime_t failover_auth_time; /* Time of previous or next election. */ - int failover_auth_count; /* Number of votes received so far. */ - int failover_auth_sent; /* True if we already asked for votes. */ - int failover_auth_rank; /* This slave rank for current auth request. */ - uint64_t failover_auth_epoch; /* Epoch of the current election. */ - int cant_failover_reason; /* Why a slave is currently not able to - failover. See the CANT_FAILOVER_* macros. */ - /* Manual failover state in common. */ - mstime_t mf_end; /* Manual failover time limit (ms unixtime). - It is zero if there is no MF in progress. */ - /* Manual failover state of master. */ - clusterNode *mf_slave; /* Slave performing the manual failover. */ - /* Manual failover state of slave. */ - long long mf_master_offset; /* Master offset the slave needs to start MF - or -1 if still not received. */ - int mf_can_start; /* If non-zero signal that the manual failover - can start requesting masters vote. */ - /* The following fields are used by masters to take state on elections. 
*/ - uint64_t lastVoteEpoch; /* Epoch of the last vote granted. */ - int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */ - /* Stats */ - /* Messages received and sent by type. */ - long long stats_bus_messages_sent[CLUSTERMSG_TYPE_COUNT]; - long long stats_bus_messages_received[CLUSTERMSG_TYPE_COUNT]; - long long stats_pfail_nodes; /* Number of nodes in PFAIL status, - excluding nodes without address. */ - unsigned long long stat_cluster_links_buffer_limit_exceeded; /* Total number of cluster links freed due to exceeding buffer limit */ - - /* Bit map for slots that are no longer claimed by the owner in cluster PING - * messages. During slot migration, the owner will stop claiming the slot after - * the ownership transfer. Set the bit corresponding to the slot when a node - * stops claiming the slot. This prevents spreading incorrect information (that - * source still owns the slot) using UPDATE messages. */ - unsigned char owner_not_claiming_slot[CLUSTER_SLOTS / 8]; -} clusterState; - -/* Redis cluster messages header */ - -/* Initially we don't know our "name", but we'll find it once we connect - * to the first node, using the getsockname() function. Then we'll use this - * address for all the next messages. */ -typedef struct { - char nodename[CLUSTER_NAMELEN]; - uint32_t ping_sent; - uint32_t pong_received; - char ip[NET_IP_STR_LEN]; /* IP address last time it was seen */ - uint16_t port; /* primary port last time it was seen */ - uint16_t cport; /* cluster port last time it was seen */ - uint16_t flags; /* node->flags copy */ - uint16_t pport; /* secondary port last time it was seen */ - uint16_t notused1; -} clusterMsgDataGossip; - -typedef struct { - char nodename[CLUSTER_NAMELEN]; -} clusterMsgDataFail; - -typedef struct { - uint32_t channel_len; - uint32_t message_len; - unsigned char bulk_data[8]; /* 8 bytes just as placeholder. 
*/ -} clusterMsgDataPublish; - -typedef struct { - uint64_t configEpoch; /* Config epoch of the specified instance. */ - char nodename[CLUSTER_NAMELEN]; /* Name of the slots owner. */ - unsigned char slots[CLUSTER_SLOTS/8]; /* Slots bitmap. */ -} clusterMsgDataUpdate; - -typedef struct { - uint64_t module_id; /* ID of the sender module. */ - uint32_t len; /* ID of the sender module. */ - uint8_t type; /* Type from 0 to 255. */ - unsigned char bulk_data[3]; /* 3 bytes just as placeholder. */ -} clusterMsgModule; - -/* The cluster supports optional extension messages that can be sent - * along with ping/pong/meet messages to give additional info in a - * consistent manner. */ -typedef enum { - CLUSTERMSG_EXT_TYPE_HOSTNAME, - CLUSTERMSG_EXT_TYPE_HUMAN_NODENAME, - CLUSTERMSG_EXT_TYPE_FORGOTTEN_NODE, - CLUSTERMSG_EXT_TYPE_SHARDID, -} clusterMsgPingtypes; - -/* Helper function for making sure extensions are eight byte aligned. */ -#define EIGHT_BYTE_ALIGN(size) ((((size) + 7) / 8) * 8) - -typedef struct { - char hostname[1]; /* The announced hostname, ends with \0. */ -} clusterMsgPingExtHostname; - -typedef struct { - char human_nodename[1]; /* The announced nodename, ends with \0. */ -} clusterMsgPingExtHumanNodename; - -typedef struct { - char name[CLUSTER_NAMELEN]; /* Node name. */ - uint64_t ttl; /* Remaining time to blacklist the node, in seconds. */ -} clusterMsgPingExtForgottenNode; - -static_assert(sizeof(clusterMsgPingExtForgottenNode) % 8 == 0, ""); - -typedef struct { - char shard_id[CLUSTER_NAMELEN]; /* The shard_id, 40 bytes fixed. */ -} clusterMsgPingExtShardId; - -typedef struct { - uint32_t length; /* Total length of this extension message (including this header) */ - uint16_t type; /* Type of this extension message (see clusterMsgPingExtTypes) */ - uint16_t unused; /* 16 bits of padding to make this structure 8 byte aligned. 
*/ - union { - clusterMsgPingExtHostname hostname; - clusterMsgPingExtHumanNodename human_nodename; - clusterMsgPingExtForgottenNode forgotten_node; - clusterMsgPingExtShardId shard_id; - } ext[]; /* Actual extension information, formatted so that the data is 8 - * byte aligned, regardless of its content. */ -} clusterMsgPingExt; - -union clusterMsgData { - /* PING, MEET and PONG */ - struct { - /* Array of N clusterMsgDataGossip structures */ - clusterMsgDataGossip gossip[1]; - /* Extension data that can optionally be sent for ping/meet/pong - * messages. We can't explicitly define them here though, since - * the gossip array isn't the real length of the gossip data. */ - } ping; - - /* FAIL */ - struct { - clusterMsgDataFail about; - } fail; - - /* PUBLISH */ - struct { - clusterMsgDataPublish msg; - } publish; - - /* UPDATE */ - struct { - clusterMsgDataUpdate nodecfg; - } update; - - /* MODULE */ - struct { - clusterMsgModule msg; - } module; -}; - -#define CLUSTER_PROTO_VER 1 /* Cluster bus protocol version. */ - -typedef struct { - char sig[4]; /* Signature "RCmb" (Redis Cluster message bus). */ - uint32_t totlen; /* Total length of this message */ - uint16_t ver; /* Protocol version, currently set to 1. */ - uint16_t port; /* Primary port number (TCP or TLS). */ - uint16_t type; /* Message type */ - uint16_t count; /* Only used for some kind of messages. */ - uint64_t currentEpoch; /* The epoch accordingly to the sending node. */ - uint64_t configEpoch; /* The config epoch if it's a master, or the last - epoch advertised by its master if it is a - slave. */ - uint64_t offset; /* Master replication offset if node is a master or - processed replication offset if node is a slave. */ - char sender[CLUSTER_NAMELEN]; /* Name of the sender node */ - unsigned char myslots[CLUSTER_SLOTS/8]; - char slaveof[CLUSTER_NAMELEN]; - char myip[NET_IP_STR_LEN]; /* Sender IP, if not all zeroed. */ - uint16_t extensions; /* Number of extensions sent along with this packet. 
*/ - char notused1[30]; /* 30 bytes reserved for future usage. */ - uint16_t pport; /* Secondary port number: if primary port is TCP port, this is - TLS port, and if primary port is TLS port, this is TCP port.*/ - uint16_t cport; /* Sender TCP cluster bus port */ - uint16_t flags; /* Sender node flags */ - unsigned char state; /* Cluster state from the POV of the sender */ - unsigned char mflags[3]; /* Message flags: CLUSTERMSG_FLAG[012]_... */ - union clusterMsgData data; -} clusterMsg; - -/* clusterMsg defines the gossip wire protocol exchanged among Redis cluster - * members, which can be running different versions of redis-server bits, - * especially during cluster rolling upgrades. - * - * Therefore, fields in this struct should remain at the same offset from - * release to release. The static asserts below ensures that incompatible - * changes in clusterMsg be caught at compile time. - */ - -static_assert(offsetof(clusterMsg, sig) == 0, "unexpected field offset"); -static_assert(offsetof(clusterMsg, totlen) == 4, "unexpected field offset"); -static_assert(offsetof(clusterMsg, ver) == 8, "unexpected field offset"); -static_assert(offsetof(clusterMsg, port) == 10, "unexpected field offset"); -static_assert(offsetof(clusterMsg, type) == 12, "unexpected field offset"); -static_assert(offsetof(clusterMsg, count) == 14, "unexpected field offset"); -static_assert(offsetof(clusterMsg, currentEpoch) == 16, "unexpected field offset"); -static_assert(offsetof(clusterMsg, configEpoch) == 24, "unexpected field offset"); -static_assert(offsetof(clusterMsg, offset) == 32, "unexpected field offset"); -static_assert(offsetof(clusterMsg, sender) == 40, "unexpected field offset"); -static_assert(offsetof(clusterMsg, myslots) == 80, "unexpected field offset"); -static_assert(offsetof(clusterMsg, slaveof) == 2128, "unexpected field offset"); -static_assert(offsetof(clusterMsg, myip) == 2168, "unexpected field offset"); -static_assert(offsetof(clusterMsg, extensions) == 2214, 
"unexpected field offset"); -static_assert(offsetof(clusterMsg, notused1) == 2216, "unexpected field offset"); -static_assert(offsetof(clusterMsg, pport) == 2246, "unexpected field offset"); -static_assert(offsetof(clusterMsg, cport) == 2248, "unexpected field offset"); -static_assert(offsetof(clusterMsg, flags) == 2250, "unexpected field offset"); -static_assert(offsetof(clusterMsg, state) == 2252, "unexpected field offset"); -static_assert(offsetof(clusterMsg, mflags) == 2253, "unexpected field offset"); -static_assert(offsetof(clusterMsg, data) == 2256, "unexpected field offset"); - -#define CLUSTERMSG_MIN_LEN (sizeof(clusterMsg)-sizeof(union clusterMsgData)) - -/* Message flags better specify the packet content or are used to - * provide some information about the node state. */ -#define CLUSTERMSG_FLAG0_PAUSED (1<<0) /* Master paused for manual failover. */ -#define CLUSTERMSG_FLAG0_FORCEACK (1<<1) /* Give ACK to AUTH_REQUEST even if - master is up. */ -#define CLUSTERMSG_FLAG0_EXT_DATA (1<<2) /* Message contains extension data */ - /* ---------------------- API exported outside cluster.c -------------------- */ +/* functions requiring mechanism specific implementations */ void clusterInit(void); -void clusterInitListeners(void); +void clusterInitLast(void); void clusterCron(void); void clusterBeforeSleep(void); -clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask); -int verifyClusterNodeId(const char *name, int length); +int verifyClusterConfigWithData(void); + +int clusterSendModuleMessageToTarget(const char *target, uint64_t module_id, uint8_t type, const char *payload, uint32_t len); + +void clusterUpdateMyselfFlags(void); +void clusterUpdateMyselfIp(void); +void clusterUpdateMyselfHostname(void); +void clusterUpdateMyselfAnnouncedPorts(void); +void clusterUpdateMyselfHumanNodename(void); + +void slotToChannelAdd(sds channel); +void slotToChannelDel(sds channel); +void clusterPropagatePublish(robj 
*channel, robj *message, int sharded); + +unsigned long getClusterConnectionsCount(void); +int isClusterHealthy(void); + +sds clusterGenNodesDescription(client *c, int filter, int tls_primary); +sds genClusterInfoString(void); +/* handle implementation specific debug cluster commands. Return 1 if handled, 0 otherwise. */ +int handleDebugClusterCommand(client *c); +const char **clusterDebugCommandExtendedHelp(void); +/* handle implementation specific cluster commands. Return 1 if handled, 0 otherwise. */ +int clusterCommandSpecial(client *c); +const char** clusterCommandExtendedHelp(void); + +int clusterAllowFailoverCmd(client *c); +void clusterPromoteSelfToMaster(void); +int clusterManualFailoverTimeLimit(void); + +void clusterCommandSlots(client * c); +void clusterCommandMyId(client *c); +void clusterCommandMyShardId(client *c); +void clusterCommandShards(client *c); +sds clusterGenNodeDescription(client *c, clusterNode *node, int tls_primary); + +int clusterNodeCoversSlot(clusterNode *n, int slot); +int getNodeDefaultClientPort(clusterNode *n); +int clusterNodeIsMyself(clusterNode *n); +clusterNode *getMyClusterNode(void); +char *getMyClusterId(void); +int getClusterSize(void); +int handleDebugClusterCommand(client *c); +int clusterNodePending(clusterNode *node); +int clusterNodeIsMaster(clusterNode *n); +char **getClusterNodesList(size_t *numnodes); +int clusterNodeIsMaster(clusterNode *n); +char *clusterNodeIp(clusterNode *node); +int clusterNodeIsSlave(clusterNode *node); +clusterNode *clusterNodeGetSlaveof(clusterNode *node); +char *clusterNodeGetName(clusterNode *node); +int clusterNodeTimedOut(clusterNode *node); +int clusterNodeIsFailing(clusterNode *node); +int clusterNodeIsNoFailover(clusterNode *node); +char *clusterNodeGetShardId(clusterNode *node); +int clusterNodeNumSlaves(clusterNode *node); +clusterNode *clusterNodeGetSlave(clusterNode *node, int slave_idx); +clusterNode *getMigratingSlotDest(int slot); +clusterNode *getImportingSlotSource(int 
slot); +clusterNode *getNodeBySlot(int slot); +int clusterNodeClientPort(clusterNode *n, int use_tls); +char *clusterNodeHostname(clusterNode *node); +const char *clusterNodePreferredEndpoint(clusterNode *n); +long long clusterNodeReplOffset(clusterNode *node); clusterNode *clusterLookupNode(const char *name, int length); + +/* functions with shared implementations */ +clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask); int clusterRedirectBlockedClientIfNeeded(client *c); void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code); void migrateCloseTimedoutSockets(void); -int verifyClusterConfigWithData(void); -unsigned long getClusterConnectionsCount(void); -int clusterSendModuleMessageToTarget(const char *target, uint64_t module_id, uint8_t type, const char *payload, uint32_t len); -void clusterPropagatePublish(robj *channel, robj *message, int sharded); unsigned int keyHashSlot(char *key, int keylen); int patternHashSlot(char *pattern, int length); -void clusterUpdateMyselfFlags(void); -void clusterUpdateMyselfIp(void); -void slotToChannelAdd(sds channel); -void slotToChannelDel(sds channel); -void clusterUpdateMyselfHostname(void); -void clusterUpdateMyselfAnnouncedPorts(void); -sds clusterGenNodesDescription(client *c, int filter, int tls_primary); -sds genClusterInfoString(void); -void freeClusterLink(clusterLink *link); -int clusterNodeGetSlotBit(clusterNode *n, int slot); -void clusterUpdateMyselfHumanNodename(void); int isValidAuxString(char *s, unsigned int length); -int getNodeDefaultClientPort(clusterNode *n); - +void migrateCommand(client *c); +void clusterCommand(client *c); +ConnectionType *connTypeOfCluster(void); #endif /* __CLUSTER_H */ diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c new file mode 100644 index 000000000..1f957c99d --- /dev/null +++ b/src/cluster_legacy.c @@ -0,0 +1,6426 @@ +/* Redis Cluster implementation. 
+ * + * Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "server.h" +#include "cluster.h" +#include "cluster_legacy.h" +#include "endianconv.h" +#include "connection.h" + +#include <sys/types.h> +#include <sys/socket.h> +#include <arpa/inet.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/stat.h> +#include <sys/file.h> +#include <math.h> + +/* A global reference to myself is handy to make code more clear. + * Myself always points to server.cluster->myself, that is, the clusterNode + * that represents this node. 
*/ +clusterNode *myself = NULL; + +clusterNode *createClusterNode(char *nodename, int flags); +void clusterAddNode(clusterNode *node); +void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask); +void clusterReadHandler(connection *conn); +void clusterSendPing(clusterLink *link, int type); +void clusterSendFail(char *nodename); +void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request); +void clusterUpdateState(void); +int clusterNodeCoversSlot(clusterNode *n, int slot); +list *clusterGetNodesInMyShard(clusterNode *node); +int clusterNodeAddSlave(clusterNode *master, clusterNode *slave); +int clusterAddSlot(clusterNode *n, int slot); +int clusterDelSlot(int slot); +int clusterDelNodeSlots(clusterNode *node); +int clusterNodeSetSlotBit(clusterNode *n, int slot); +void clusterSetMaster(clusterNode *n); +void clusterHandleSlaveFailover(void); +void clusterHandleSlaveMigration(int max_slaves); +int bitmapTestBit(unsigned char *bitmap, int pos); +void bitmapSetBit(unsigned char *bitmap, int pos); +void bitmapClearBit(unsigned char *bitmap, int pos); +void clusterDoBeforeSleep(int flags); +void clusterSendUpdate(clusterLink *link, clusterNode *node); +void resetManualFailover(void); +void clusterCloseAllSlots(void); +void clusterSetNodeAsMaster(clusterNode *n); +void clusterDelNode(clusterNode *delnode); +sds representClusterNodeFlags(sds ci, uint16_t flags); +sds representSlotInfo(sds ci, uint16_t *slot_info_pairs, int slot_info_pairs_count); +void clusterFreeNodesSlotsInfo(clusterNode *n); +uint64_t clusterGetMaxEpoch(void); +int clusterBumpConfigEpochWithoutConsensus(void); +void moduleCallClusterReceivers(const char *sender_id, uint64_t module_id, uint8_t type, const unsigned char *payload, uint32_t len); +const char *clusterGetMessageTypeString(int type); +void removeChannelsInSlot(unsigned int slot); +unsigned int countKeysInSlot(unsigned int hashslot); +unsigned int countChannelsInSlot(unsigned int hashslot); +unsigned int 
delKeysInSlot(unsigned int hashslot); +void clusterAddNodeToShard(const char *shard_id, clusterNode *node); +list *clusterLookupNodeListByShardId(const char *shard_id); +void clusterRemoveNodeFromShard(clusterNode *node); +int auxShardIdSetter(clusterNode *n, void *value, int length); +sds auxShardIdGetter(clusterNode *n, sds s); +int auxShardIdPresent(clusterNode *n); +int auxHumanNodenameSetter(clusterNode *n, void *value, int length); +sds auxHumanNodenameGetter(clusterNode *n, sds s); +int auxHumanNodenamePresent(clusterNode *n); +int auxTcpPortSetter(clusterNode *n, void *value, int length); +sds auxTcpPortGetter(clusterNode *n, sds s); +int auxTcpPortPresent(clusterNode *n); +int auxTlsPortSetter(clusterNode *n, void *value, int length); +sds auxTlsPortGetter(clusterNode *n, sds s); +int auxTlsPortPresent(clusterNode *n); +static void clusterBuildMessageHdr(clusterMsg *hdr, int type, size_t msglen); +void freeClusterLink(clusterLink *link); +int verifyClusterNodeId(const char *name, int length); + +int getNodeDefaultClientPort(clusterNode *n) { + return server.tls_cluster ? n->tls_port : n->tcp_port; +} + +static inline int getNodeDefaultReplicationPort(clusterNode *n) { + return server.tls_replication ? n->tls_port : n->tcp_port; +} + +int clusterNodeClientPort(clusterNode *n, int use_tls) { + return use_tls ? n->tls_port : n->tcp_port; +} + +static inline int defaultClientPort(void) { + return server.tls_cluster ? server.tls_port : server.port; +} + +#define isSlotUnclaimed(slot) \ + (server.cluster->slots[slot] == NULL || \ + bitmapTestBit(server.cluster->owner_not_claiming_slot, slot)) + +#define RCVBUF_INIT_LEN 1024 +#define RCVBUF_MAX_PREALLOC (1<<20) /* 1MB */ + +/* Cluster nodes hash table, mapping nodes addresses 1.2.3.4:6379 to + * clusterNode structures. 
*/ +dictType clusterNodesDictType = { + dictSdsHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictSdsKeyCompare, /* key compare */ + dictSdsDestructor, /* key destructor */ + NULL, /* val destructor */ + NULL /* allow to expand */ +}; + +/* Cluster re-addition blacklist. This maps node IDs to the time + * we can re-add this node. The goal is to avoid reading a removed + * node for some time. */ +dictType clusterNodesBlackListDictType = { + dictSdsCaseHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictSdsKeyCaseCompare, /* key compare */ + dictSdsDestructor, /* key destructor */ + NULL, /* val destructor */ + NULL /* allow to expand */ +}; + +/* Cluster shards hash table, mapping shard id to list of nodes */ +dictType clusterSdsToListType = { + dictSdsHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictSdsKeyCompare, /* key compare */ + dictSdsDestructor, /* key destructor */ + dictListDestructor, /* val destructor */ + NULL /* allow to expand */ +}; + +/* Aux fields are introduced in Redis 7.2 to support the persistence + * of various important node properties, such as shard id, in nodes.conf. + * Aux fields take an explicit format of name=value pairs and have no + * intrinsic order among them. Aux fields are always grouped together + * at the end of the second column of each row after the node's IP + * address/port/cluster_port and the optional hostname. Aux fields + * are separated by ','. 
*/ + +/* Aux field setter function prototype + * return C_OK when the update is successful; C_ERR otherwise */ +typedef int (aux_value_setter) (clusterNode* n, void *value, int length); +/* Aux field getter function prototype + * return an sds that is a concatenation of the input sds string and + * the aux value */ +typedef sds (aux_value_getter) (clusterNode* n, sds s); + +typedef int (aux_value_present) (clusterNode* n); + +typedef struct { + char *field; + aux_value_setter *setter; + aux_value_getter *getter; + aux_value_present *isPresent; +} auxFieldHandler; + +/* Assign index to each aux field */ +typedef enum { + af_shard_id, + af_human_nodename, + af_tcp_port, + af_tls_port, + af_count, +} auxFieldIndex; + +/* Note that + * 1. the order of the elements below must match that of their + * indices as defined in auxFieldIndex + * 2. aux name can contain characters that pass the isValidAuxChar check only */ +auxFieldHandler auxFieldHandlers[] = { + {"shard-id", auxShardIdSetter, auxShardIdGetter, auxShardIdPresent}, + {"nodename", auxHumanNodenameSetter, auxHumanNodenameGetter, auxHumanNodenamePresent}, + {"tcp-port", auxTcpPortSetter, auxTcpPortGetter, auxTcpPortPresent}, + {"tls-port", auxTlsPortSetter, auxTlsPortGetter, auxTlsPortPresent}, +}; + +int auxShardIdSetter(clusterNode *n, void *value, int length) { + if (verifyClusterNodeId(value, length) == C_ERR) { + return C_ERR; + } + memcpy(n->shard_id, value, CLUSTER_NAMELEN); + /* if n already has replicas, make sure they all agree + * on the shard id */ + for (int i = 0; i < n->numslaves; i++) { + if (memcmp(n->slaves[i]->shard_id, n->shard_id, CLUSTER_NAMELEN) != 0) { + return C_ERR; + } + } + clusterAddNodeToShard(value, n); + return C_OK; +} + +sds auxShardIdGetter(clusterNode *n, sds s) { + return sdscatprintf(s, "%.40s", n->shard_id); +} + +int auxShardIdPresent(clusterNode *n) { + return strlen(n->shard_id); +} + +int auxHumanNodenameSetter(clusterNode *n, void *value, int length) { + if (n && 
!strncmp(value, n->human_nodename, length)) { + return C_OK; + } else if (!n && (length == 0)) { + return C_OK; + } + if (n) { + n->human_nodename = sdscpylen(n->human_nodename, value, length); + } else if (sdslen(n->human_nodename) != 0) { + sdsclear(n->human_nodename); + } else { + return C_ERR; + } + return C_OK; +} + +sds auxHumanNodenameGetter(clusterNode *n, sds s) { + return sdscatprintf(s, "%s", n->human_nodename); +} + +int auxHumanNodenamePresent(clusterNode *n) { + return sdslen(n->human_nodename); +} + +int auxTcpPortSetter(clusterNode *n, void *value, int length) { + if (length > 5 || length < 1) { + return C_ERR; + } + char buf[length + 1]; + memcpy(buf, (char*)value, length); + buf[length] = '\0'; + n->tcp_port = atoi(buf); + return (n->tcp_port < 0 || n->tcp_port >= 65536) ? C_ERR : C_OK; +} + +sds auxTcpPortGetter(clusterNode *n, sds s) { + return sdscatprintf(s, "%d", n->tcp_port); +} + +int auxTcpPortPresent(clusterNode *n) { + return n->tcp_port >= 0 && n->tcp_port < 65536; +} + +int auxTlsPortSetter(clusterNode *n, void *value, int length) { + if (length > 5 || length < 1) { + return C_ERR; + } + char buf[length + 1]; + memcpy(buf, (char*)value, length); + buf[length] = '\0'; + n->tls_port = atoi(buf); + return (n->tls_port < 0 || n->tls_port >= 65536) ? C_ERR : C_OK; +} + +sds auxTlsPortGetter(clusterNode *n, sds s) { + return sdscatprintf(s, "%d", n->tls_port); +} + +int auxTlsPortPresent(clusterNode *n) { + return n->tls_port >= 0 && n->tls_port < 65536; +} + +/* clusterLink send queue blocks */ +typedef struct { + size_t totlen; /* Total length of this block including the message */ + int refcount; /* Number of cluster link send msg queues containing the message */ + clusterMsg msg; +} clusterMsgSendBlock; + +/* ----------------------------------------------------------------------------- + * Initialization + * -------------------------------------------------------------------------- */ + +/* Load the cluster config from 'filename'. 
+ * + * If the file does not exist or is zero-length (this may happen because + * when we lock the nodes.conf file, we create a zero-length one for the + * sake of locking if it does not already exist), C_ERR is returned. + * If the configuration was loaded from the file, C_OK is returned. */ +int clusterLoadConfig(char *filename) { + FILE *fp = fopen(filename,"r"); + struct stat sb; + char *line; + int maxline, j; + + if (fp == NULL) { + if (errno == ENOENT) { + return C_ERR; + } else { + serverLog(LL_WARNING, + "Loading the cluster node config from %s: %s", + filename, strerror(errno)); + exit(1); + } + } + + if (redis_fstat(fileno(fp),&sb) == -1) { + serverLog(LL_WARNING, + "Unable to obtain the cluster node config file stat %s: %s", + filename, strerror(errno)); + exit(1); + } + /* Check if the file is zero-length: if so return C_ERR to signal + * we have to write the config. */ + if (sb.st_size == 0) { + fclose(fp); + return C_ERR; + } + + /* Parse the file. Note that single lines of the cluster config file can + * be really long as they include all the hash slots of the node. + * This means in the worst possible case, half of the Redis slots will be + * present in a single line, possibly in importing or migrating state, so + * together with the node ID of the sender/receiver. + * + * To simplify we allocate 1024+CLUSTER_SLOTS*128 bytes per line. */ + maxline = 1024+CLUSTER_SLOTS*128; + line = zmalloc(maxline); + while(fgets(line,maxline,fp) != NULL) { + int argc, aux_argc; + sds *argv, *aux_argv; + clusterNode *n, *master; + char *p, *s; + + /* Skip blank lines, they can be created either by users manually + * editing nodes.conf or by the config writing process if stopped + * before the truncate() call. */ + if (line[0] == '\n' || line[0] == '\0') continue; + + /* Split the line into arguments for processing. */ + argv = sdssplitargs(line,&argc); + if (argv == NULL) goto fmterr; + + /* Handle the special "vars" line. 
Don't pretend it is the last + * line even if it actually is when generated by Redis. */ + if (strcasecmp(argv[0],"vars") == 0) { + if (!(argc % 2)) goto fmterr; + for (j = 1; j < argc; j += 2) { + if (strcasecmp(argv[j],"currentEpoch") == 0) { + server.cluster->currentEpoch = + strtoull(argv[j+1],NULL,10); + } else if (strcasecmp(argv[j],"lastVoteEpoch") == 0) { + server.cluster->lastVoteEpoch = + strtoull(argv[j+1],NULL,10); + } else { + serverLog(LL_NOTICE, + "Skipping unknown cluster config variable '%s'", + argv[j]); + } + } + sdsfreesplitres(argv,argc); + continue; + } + + /* Regular config lines have at least eight fields */ + if (argc < 8) { + sdsfreesplitres(argv,argc); + goto fmterr; + } + + /* Create this node if it does not exist */ + if (verifyClusterNodeId(argv[0], sdslen(argv[0])) == C_ERR) { + sdsfreesplitres(argv, argc); + goto fmterr; + } + n = clusterLookupNode(argv[0], sdslen(argv[0])); + if (!n) { + n = createClusterNode(argv[0],0); + clusterAddNode(n); + } + /* Format for the node address and auxiliary argument information: + * ip:port[@cport][,hostname][,aux=val]*] */ + + aux_argv = sdssplitlen(argv[1], sdslen(argv[1]), ",", 1, &aux_argc); + if (aux_argv == NULL) { + sdsfreesplitres(argv,argc); + goto fmterr; + } + + /* Hostname is an optional argument that defines the endpoint + * that can be reported to clients instead of IP. */ + if (aux_argc > 1 && sdslen(aux_argv[1]) > 0) { + n->hostname = sdscpy(n->hostname, aux_argv[1]); + } else if (sdslen(n->hostname) != 0) { + sdsclear(n->hostname); + } + + /* All fields after hostname are auxiliary and they take on + * the format of "aux=val" where both aux and val can contain + * characters that pass the isValidAuxChar check only. The order + * of the aux fields is insignificant. 
*/ + int aux_tcp_port = 0; + int aux_tls_port = 0; + for (int i = 2; i < aux_argc; i++) { + int field_argc; + sds *field_argv; + field_argv = sdssplitlen(aux_argv[i], sdslen(aux_argv[i]), "=", 1, &field_argc); + if (field_argv == NULL || field_argc != 2) { + /* Invalid aux field format */ + if (field_argv != NULL) sdsfreesplitres(field_argv, field_argc); + sdsfreesplitres(argv,argc); + goto fmterr; + } + + /* Validate that both aux and value contain valid characters only */ + for (unsigned j = 0; j < 2; j++) { + if (!isValidAuxString(field_argv[j],sdslen(field_argv[j]))){ + /* Invalid aux field format */ + sdsfreesplitres(field_argv, field_argc); + sdsfreesplitres(argv,argc); + goto fmterr; + } + } + + /* Note that we don't expect lots of aux fields in the foreseeable + * future so a linear search is completely fine. */ + int field_found = 0; + for (unsigned j = 0; j < numElements(auxFieldHandlers); j++) { + if (sdslen(field_argv[0]) != strlen(auxFieldHandlers[j].field) || + memcmp(field_argv[0], auxFieldHandlers[j].field, sdslen(field_argv[0])) != 0) { + continue; + } + field_found = 1; + aux_tcp_port |= j == af_tcp_port; + aux_tls_port |= j == af_tls_port; + if (auxFieldHandlers[j].setter(n, field_argv[1], sdslen(field_argv[1])) != C_OK) { + /* Invalid aux field format */ + sdsfreesplitres(field_argv, field_argc); + sdsfreesplitres(argv,argc); + goto fmterr; + } + } + + if (field_found == 0) { + /* Invalid aux field format */ + sdsfreesplitres(field_argv, field_argc); + sdsfreesplitres(argv,argc); + goto fmterr; + } + + sdsfreesplitres(field_argv, field_argc); + } + /* Address and port */ + if ((p = strrchr(aux_argv[0],':')) == NULL) { + sdsfreesplitres(aux_argv, aux_argc); + sdsfreesplitres(argv,argc); + goto fmterr; + } + *p = '\0'; + memcpy(n->ip,aux_argv[0],strlen(aux_argv[0])+1); + char *port = p+1; + char *busp = strchr(port,'@'); + if (busp) { + *busp = '\0'; + busp++; + } + /* If neither TCP or TLS port is found in aux field, it is considered + * an old 
version of nodes.conf file.*/ + if (!aux_tcp_port && !aux_tls_port) { + if (server.tls_cluster) { + n->tls_port = atoi(port); + } else { + n->tcp_port = atoi(port); + } + } else if (!aux_tcp_port) { + n->tcp_port = atoi(port); + } else if (!aux_tls_port) { + n->tls_port = atoi(port); + } + /* In older versions of nodes.conf the "@busport" part is missing. + * In this case we set it to the default offset of 10000 from the + * base port. */ + n->cport = busp ? atoi(busp) : (getNodeDefaultClientPort(n) + CLUSTER_PORT_INCR); + + /* The plaintext port for client in a TLS cluster (n->pport) is not + * stored in nodes.conf. It is received later over the bus protocol. */ + + sdsfreesplitres(aux_argv, aux_argc); + + /* Parse flags */ + p = s = argv[2]; + while(p) { + p = strchr(s,','); + if (p) *p = '\0'; + if (!strcasecmp(s,"myself")) { + serverAssert(server.cluster->myself == NULL); + myself = server.cluster->myself = n; + n->flags |= CLUSTER_NODE_MYSELF; + } else if (!strcasecmp(s,"master")) { + n->flags |= CLUSTER_NODE_MASTER; + } else if (!strcasecmp(s,"slave")) { + n->flags |= CLUSTER_NODE_SLAVE; + } else if (!strcasecmp(s,"fail?")) { + n->flags |= CLUSTER_NODE_PFAIL; + } else if (!strcasecmp(s,"fail")) { + n->flags |= CLUSTER_NODE_FAIL; + n->fail_time = mstime(); + } else if (!strcasecmp(s,"handshake")) { + n->flags |= CLUSTER_NODE_HANDSHAKE; + } else if (!strcasecmp(s,"noaddr")) { + n->flags |= CLUSTER_NODE_NOADDR; + } else if (!strcasecmp(s,"nofailover")) { + n->flags |= CLUSTER_NODE_NOFAILOVER; + } else if (!strcasecmp(s,"noflags")) { + /* nothing to do */ + } else { + serverPanic("Unknown flag in redis cluster config file"); + } + if (p) s = p+1; + } + + /* Get master if any. Set the master and populate master's + * slave list. 
*/ + if (argv[3][0] != '-') { + if (verifyClusterNodeId(argv[3], sdslen(argv[3])) == C_ERR) { + sdsfreesplitres(argv, argc); + goto fmterr; + } + master = clusterLookupNode(argv[3], sdslen(argv[3])); + if (!master) { + master = createClusterNode(argv[3],0); + clusterAddNode(master); + } + /* shard_id can be absent if we are loading a nodes.conf generated + * by an older version of Redis; we should follow the primary's + * shard_id in this case */ + if (auxFieldHandlers[af_shard_id].isPresent(n) == 0) { + memcpy(n->shard_id, master->shard_id, CLUSTER_NAMELEN); + clusterAddNodeToShard(master->shard_id, n); + } else if (clusterGetNodesInMyShard(master) != NULL && + memcmp(master->shard_id, n->shard_id, CLUSTER_NAMELEN) != 0) + { + /* If the primary has been added to a shard, make sure this + * node has the same persisted shard id as the primary. */ + goto fmterr; + } + n->slaveof = master; + clusterNodeAddSlave(master,n); + } else if (auxFieldHandlers[af_shard_id].isPresent(n) == 0) { + /* n is a primary but it does not have a persisted shard_id. + * This happens if we are loading a nodes.conf generated by + * an older version of Redis. We should manually update the + * shard membership in this case */ + clusterAddNodeToShard(n->shard_id, n); + } + + /* Set ping sent / pong received timestamps */ + if (atoi(argv[4])) n->ping_sent = mstime(); + if (atoi(argv[5])) n->pong_received = mstime(); + + /* Set configEpoch for this node. + * If the node is a replica, set its config epoch to 0. + * If it's a primary, load the config epoch from the configuration file. */ + n->configEpoch = (nodeIsSlave(n) && n->slaveof) ? 0 : strtoull(argv[6],NULL,10); + + /* Populate hash slots served by this instance. 
*/ + for (j = 8; j < argc; j++) { + int start, stop; + + if (argv[j][0] == '[') { + /* Here we handle migrating / importing slots */ + int slot; + char direction; + clusterNode *cn; + + p = strchr(argv[j],'-'); + serverAssert(p != NULL); + *p = '\0'; + direction = p[1]; /* Either '>' or '<' */ + slot = atoi(argv[j]+1); + if (slot < 0 || slot >= CLUSTER_SLOTS) { + sdsfreesplitres(argv,argc); + goto fmterr; + } + p += 3; + + char *pr = strchr(p, ']'); + size_t node_len = pr - p; + if (pr == NULL || verifyClusterNodeId(p, node_len) == C_ERR) { + sdsfreesplitres(argv, argc); + goto fmterr; + } + cn = clusterLookupNode(p, CLUSTER_NAMELEN); + if (!cn) { + cn = createClusterNode(p,0); + clusterAddNode(cn); + } + if (direction == '>') { + server.cluster->migrating_slots_to[slot] = cn; + } else { + server.cluster->importing_slots_from[slot] = cn; + } + continue; + } else if ((p = strchr(argv[j],'-')) != NULL) { + *p = '\0'; + start = atoi(argv[j]); + stop = atoi(p+1); + } else { + start = stop = atoi(argv[j]); + } + if (start < 0 || start >= CLUSTER_SLOTS || + stop < 0 || stop >= CLUSTER_SLOTS) + { + sdsfreesplitres(argv,argc); + goto fmterr; + } + while(start <= stop) clusterAddSlot(n, start++); + } + + sdsfreesplitres(argv,argc); + } + /* Config sanity check */ + if (server.cluster->myself == NULL) goto fmterr; + + zfree(line); + fclose(fp); + + serverLog(LL_NOTICE,"Node configuration loaded, I'm %.40s", myself->name); + + /* Something that should never happen: currentEpoch smaller than + * the max epoch found in the nodes configuration. However we handle this + * as some form of protection against manual editing of critical files. 
*/ + if (clusterGetMaxEpoch() > server.cluster->currentEpoch) { + server.cluster->currentEpoch = clusterGetMaxEpoch(); + } + return C_OK; + +fmterr: + serverLog(LL_WARNING, + "Unrecoverable error: corrupted cluster config file \"%s\".", line); + zfree(line); + if (fp) fclose(fp); + exit(1); +} + +/* Cluster node configuration is exactly the same as CLUSTER NODES output. + * + * This function writes the node config and returns 0, on error -1 + * is returned. + * + * Note: we need to write the file in an atomic way from the point of view + * of the POSIX filesystem semantics, so that if the server is stopped + * or crashes during the write, we'll end with either the old file or the + * new one. Since we have the full payload to write available we can use + * a single write to write the whole file. If the pre-existing file was + * bigger we pad our payload with newlines that are anyway ignored and truncate + * the file afterward. */ +int clusterSaveConfig(int do_fsync) { + sds ci,tmpfilename; + size_t content_size,offset = 0; + ssize_t written_bytes; + int fd = -1; + int retval = C_ERR; + + server.cluster->todo_before_sleep &= ~CLUSTER_TODO_SAVE_CONFIG; + + /* Get the nodes description and concatenate our "vars" directive to + * save currentEpoch and lastVoteEpoch. */ + ci = clusterGenNodesDescription(NULL, CLUSTER_NODE_HANDSHAKE, 0); + ci = sdscatprintf(ci,"vars currentEpoch %llu lastVoteEpoch %llu\n", + (unsigned long long) server.cluster->currentEpoch, + (unsigned long long) server.cluster->lastVoteEpoch); + content_size = sdslen(ci); + + /* Create a temp file with the new content. 
*/ + tmpfilename = sdscatfmt(sdsempty(),"%s.tmp-%i-%I", + server.cluster_configfile,(int) getpid(),mstime()); + if ((fd = open(tmpfilename,O_WRONLY|O_CREAT,0644)) == -1) { + serverLog(LL_WARNING,"Could not open temp cluster config file: %s",strerror(errno)); + goto cleanup; + } + + while (offset < content_size) { + written_bytes = write(fd,ci + offset,content_size - offset); + if (written_bytes <= 0) { + if (errno == EINTR) continue; + serverLog(LL_WARNING,"Failed after writing (%zd) bytes to tmp cluster config file: %s", + offset,strerror(errno)); + goto cleanup; + } + offset += written_bytes; + } + + if (do_fsync) { + server.cluster->todo_before_sleep &= ~CLUSTER_TODO_FSYNC_CONFIG; + if (redis_fsync(fd) == -1) { + serverLog(LL_WARNING,"Could not sync tmp cluster config file: %s",strerror(errno)); + goto cleanup; + } + } + + if (rename(tmpfilename, server.cluster_configfile) == -1) { + serverLog(LL_WARNING,"Could not rename tmp cluster config file: %s",strerror(errno)); + goto cleanup; + } + + if (do_fsync) { + if (fsyncFileDir(server.cluster_configfile) == -1) { + serverLog(LL_WARNING,"Could not sync cluster config file dir: %s",strerror(errno)); + goto cleanup; + } + } + retval = C_OK; /* If we reached this point, everything is fine. */ + +cleanup: + if (fd != -1) close(fd); + if (retval) unlink(tmpfilename); + sdsfree(tmpfilename); + sdsfree(ci); + return retval; +} + +void clusterSaveConfigOrDie(int do_fsync) { + if (clusterSaveConfig(do_fsync) == -1) { + serverLog(LL_WARNING,"Fatal: can't update cluster config file."); + exit(1); + } +} + +/* Lock the cluster config using flock(), and retain the file descriptor used to + * acquire the lock so that the file will be locked as long as the process is up. + * + * This works because we always update nodes.conf with a new version + * in-place, reopening the file, and writing to it in place (later adjusting + * the length with ftruncate()). 
+ * + * On success C_OK is returned, otherwise an error is logged and + * the function returns C_ERR to signal a lock was not acquired. */ +int clusterLockConfig(char *filename) { +/* flock() does not exist on Solaris + * and a fcntl-based solution won't help, as we constantly re-open that file, + * which will release _all_ locks anyway + */ +#if !defined(__sun) + /* To lock it, we need to open the file in a way it is created if + * it does not exist, otherwise there is a race condition with other + * processes. */ + int fd = open(filename,O_WRONLY|O_CREAT|O_CLOEXEC,0644); + if (fd == -1) { + serverLog(LL_WARNING, + "Can't open %s in order to acquire a lock: %s", + filename, strerror(errno)); + return C_ERR; + } + + if (flock(fd,LOCK_EX|LOCK_NB) == -1) { + if (errno == EWOULDBLOCK) { + serverLog(LL_WARNING, + "Sorry, the cluster configuration file %s is already used " + "by a different Redis Cluster node. Please make sure that " + "different nodes use different cluster configuration " + "files.", filename); + } else { + serverLog(LL_WARNING, + "Impossible to lock %s: %s", filename, strerror(errno)); + } + close(fd); + return C_ERR; + } + /* Lock acquired: leak the 'fd' by not closing it until shutdown time, so that + * we'll retain the lock to the file as long as the process exists. + * + * After fork, the child process will get the fd opened by the parent process, + * we need save `fd` to `cluster_config_file_lock_fd`, so that in redisFork(), + * it will be closed in the child process. + * If it is not closed, when the main process is killed -9, but the child process + * (redis-aof-rewrite) is still alive, the fd(lock) will still be held by the + * child process, and the main process will fail to get lock, means fail to start. */ + server.cluster_config_file_lock_fd = fd; +#else + UNUSED(filename); +#endif /* __sun */ + + return C_OK; +} + +/* Derives our ports to be announced in the cluster bus. 
 */ +void deriveAnnouncedPorts(int *announced_tcp_port, int *announced_tls_port, + int *announced_cport) { + /* Config overriding announced ports. */ + *announced_tcp_port = server.cluster_announce_port ? + server.cluster_announce_port : server.port; + *announced_tls_port = server.cluster_announce_tls_port ? + server.cluster_announce_tls_port : server.tls_port; + /* Derive cluster bus port. */ + if (server.cluster_announce_bus_port) { + *announced_cport = server.cluster_announce_bus_port; + } else if (server.cluster_port) { + *announced_cport = server.cluster_port; + } else { + *announced_cport = defaultClientPort() + CLUSTER_PORT_INCR; + } +} + +/* Some flags (currently just the NOFAILOVER flag) may need to be updated + * in the "myself" node based on the current configuration of the node, + * that may change at runtime via CONFIG SET. This function changes the + * set of flags in myself->flags accordingly. */ +void clusterUpdateMyselfFlags(void) { + if (!myself) return; + int oldflags = myself->flags; + int nofailover = server.cluster_slave_no_failover ? + CLUSTER_NODE_NOFAILOVER : 0; + myself->flags &= ~CLUSTER_NODE_NOFAILOVER; + myself->flags |= nofailover; + if (myself->flags != oldflags) { + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); + } +} + + +/* We want to keep myself->port/cport/pport in sync with the +* cluster-announce-port/cluster-announce-bus-port/cluster-announce-tls-port option. +* The option can be set at runtime via CONFIG SET. */ +void clusterUpdateMyselfAnnouncedPorts(void) { + if (!myself) return; + deriveAnnouncedPorts(&myself->tcp_port,&myself->tls_port,&myself->cport); +} + +/* We want to keep myself->ip in sync with the cluster-announce-ip option. +* The option can be set at runtime via CONFIG SET. 
*/ +void clusterUpdateMyselfIp(void) { + if (!myself) return; + static char *prev_ip = NULL; + char *curr_ip = server.cluster_announce_ip; + int changed = 0; + + if (prev_ip == NULL && curr_ip != NULL) changed = 1; + else if (prev_ip != NULL && curr_ip == NULL) changed = 1; + else if (prev_ip && curr_ip && strcmp(prev_ip,curr_ip)) changed = 1; + + if (changed) { + if (prev_ip) zfree(prev_ip); + prev_ip = curr_ip; + + if (curr_ip) { + /* We always take a copy of the previous IP address, by + * duplicating the string. This way later we can check if + * the address really changed. */ + prev_ip = zstrdup(prev_ip); + redis_strlcpy(myself->ip,server.cluster_announce_ip,NET_IP_STR_LEN); + } else { + myself->ip[0] = '\0'; /* Force autodetection. */ + } + } +} + +/* Update the hostname for the specified node with the provided C string. */ +static void updateAnnouncedHostname(clusterNode *node, char *new) { + /* Previous and new hostname are the same, no need to update. */ + if (new && !strcmp(new, node->hostname)) { + return; + } else if (!new && (sdslen(node->hostname) == 0)) { + return; + } + + if (new) { + node->hostname = sdscpy(node->hostname, new); + } else if (sdslen(node->hostname) != 0) { + sdsclear(node->hostname); + } + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); +} + +static void updateAnnouncedHumanNodename(clusterNode *node, char *new) { + if (new && !strcmp(new, node->human_nodename)) { + return; + } else if (!new && (sdslen(node->human_nodename) == 0)) { + return; + } + + if (new) { + node->human_nodename = sdscpy(node->human_nodename, new); + } else if (sdslen(node->human_nodename) != 0) { + sdsclear(node->human_nodename); + } + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); +} + + +static void updateShardId(clusterNode *node, const char *shard_id) { + if (shard_id && memcmp(node->shard_id, shard_id, CLUSTER_NAMELEN) != 0) { + clusterRemoveNodeFromShard(node); + memcpy(node->shard_id, shard_id, CLUSTER_NAMELEN); + clusterAddNodeToShard(shard_id, node); 
+ clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + } + if (shard_id && myself != node && myself->slaveof == node) { + if (memcmp(myself->shard_id, shard_id, CLUSTER_NAMELEN) != 0) { + /* shard-id can diverge right after a rolling upgrade + * from pre-7.2 releases */ + clusterRemoveNodeFromShard(myself); + memcpy(myself->shard_id, shard_id, CLUSTER_NAMELEN); + clusterAddNodeToShard(shard_id, myself); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_FSYNC_CONFIG); + } + } +} + +/* Update my hostname based on server configuration values */ +void clusterUpdateMyselfHostname(void) { + if (!myself) return; + updateAnnouncedHostname(myself, server.cluster_announce_hostname); +} + +void clusterUpdateMyselfHumanNodename(void) { + if (!myself) return; + updateAnnouncedHumanNodename(myself, server.cluster_announce_human_nodename); +} + +void clusterInit(void) { + int saveconf = 0; + + server.cluster = zmalloc(sizeof(struct clusterState)); + server.cluster->myself = NULL; + server.cluster->currentEpoch = 0; + server.cluster->state = CLUSTER_FAIL; + server.cluster->size = 1; + server.cluster->todo_before_sleep = 0; + server.cluster->nodes = dictCreate(&clusterNodesDictType); + server.cluster->shards = dictCreate(&clusterSdsToListType); + server.cluster->nodes_black_list = + dictCreate(&clusterNodesBlackListDictType); + server.cluster->failover_auth_time = 0; + server.cluster->failover_auth_count = 0; + server.cluster->failover_auth_rank = 0; + server.cluster->failover_auth_epoch = 0; + server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE; + server.cluster->lastVoteEpoch = 0; + + /* Initialize stats */ + for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) { + server.cluster->stats_bus_messages_sent[i] = 0; + server.cluster->stats_bus_messages_received[i] = 0; + } + server.cluster->stats_pfail_nodes = 0; + server.cluster->stat_cluster_links_buffer_limit_exceeded = 0; + + memset(server.cluster->slots,0, sizeof(server.cluster->slots)); + 
clusterCloseAllSlots(); + + memset(server.cluster->owner_not_claiming_slot, 0, sizeof(server.cluster->owner_not_claiming_slot)); + + /* Lock the cluster config file to make sure every node uses + * its own nodes.conf. */ + server.cluster_config_file_lock_fd = -1; + if (clusterLockConfig(server.cluster_configfile) == C_ERR) + exit(1); + + /* Load or create a new nodes configuration. */ + if (clusterLoadConfig(server.cluster_configfile) == C_ERR) { + /* No configuration found. We will just use the random name provided + * by the createClusterNode() function. */ + myself = server.cluster->myself = + createClusterNode(NULL,CLUSTER_NODE_MYSELF|CLUSTER_NODE_MASTER); + serverLog(LL_NOTICE,"No cluster configuration found, I'm %.40s", + myself->name); + clusterAddNode(myself); + clusterAddNodeToShard(myself->shard_id, myself); + saveconf = 1; + } + if (saveconf) clusterSaveConfigOrDie(1); + + /* Port sanity check II + * The other handshake port check is triggered too late to stop + * us from trying to use a too-high cluster port number. */ + int port = defaultClientPort(); + if (!server.cluster_port && port > (65535-CLUSTER_PORT_INCR)) { + serverLog(LL_WARNING, "Redis port number too high. " + "Cluster communication port is 10,000 port " + "numbers higher than your Redis port. " + "Your Redis port number must be 55535 or less."); + exit(1); + } + if (!server.bindaddr_count) { + serverLog(LL_WARNING, "No bind address is configured, but it is required for the Cluster bus."); + exit(1); + } + + /* The slots -> channels map is a radix tree. Initialize it here. */ + server.cluster->slots_to_channels = raxNew(); + + /* Set myself->port/cport/pport to my listening ports, we'll just need to + * discover the IP address via MEET messages. 
*/ + deriveAnnouncedPorts(&myself->tcp_port, &myself->tls_port, &myself->cport); + + server.cluster->mf_end = 0; + server.cluster->mf_slave = NULL; + resetManualFailover(); + clusterUpdateMyselfFlags(); + clusterUpdateMyselfIp(); + clusterUpdateMyselfHostname(); + clusterUpdateMyselfHumanNodename(); +} + +void clusterInitLast(void) { + if (connectionIndexByType(connTypeOfCluster()->get_type(NULL)) < 0) { + serverLog(LL_WARNING, "Missing connection type %s, but it is required for the Cluster bus.", connTypeOfCluster()->get_type(NULL)); + exit(1); + } + + int port = defaultClientPort(); + connListener *listener = &server.clistener; + listener->count = 0; + listener->bindaddr = server.bindaddr; + listener->bindaddr_count = server.bindaddr_count; + listener->port = server.cluster_port ? server.cluster_port : port + CLUSTER_PORT_INCR; + listener->ct = connTypeOfCluster(); + if (connListen(listener) == C_ERR ) { + /* Note: the following log text is matched by the test suite. */ + serverLog(LL_WARNING, "Failed listening on port %u (cluster), aborting.", listener->port); + exit(1); + } + + if (createSocketAcceptHandler(&server.clistener, clusterAcceptHandler) != C_OK) { + serverPanic("Unrecoverable error creating Redis Cluster socket accept handler."); + } +} + +/* Reset a node performing a soft or hard reset: + * + * 1) All other nodes are forgotten. + * 2) All the assigned / open slots are released. + * 3) If the node is a slave, it turns into a master. + * 4) Only for hard reset: a new Node ID is generated. + * 5) Only for hard reset: currentEpoch and configEpoch are set to 0. + * 6) The new configuration is saved and the cluster state updated. + * 7) If the node was a slave, the whole data set is flushed away. */ +void clusterReset(int hard) { + dictIterator *di; + dictEntry *de; + int j; + + /* Turn into master. 
*/ + if (nodeIsSlave(myself)) { + clusterSetNodeAsMaster(myself); + replicationUnsetMaster(); + emptyData(-1,EMPTYDB_NO_FLAGS,NULL); + } + + /* Close slots, reset manual failover state. */ + clusterCloseAllSlots(); + resetManualFailover(); + + /* Unassign all the slots. */ + for (j = 0; j < CLUSTER_SLOTS; j++) clusterDelSlot(j); + + /* Recreate shards dict */ + dictEmpty(server.cluster->shards, NULL); + + /* Forget all the nodes, but myself. */ + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + if (node == myself) continue; + clusterDelNode(node); + } + dictReleaseIterator(di); + + /* Empty the nodes blacklist. */ + dictEmpty(server.cluster->nodes_black_list, NULL); + + /* Hard reset only: set epochs to 0, change node ID. */ + if (hard) { + sds oldname; + + server.cluster->currentEpoch = 0; + server.cluster->lastVoteEpoch = 0; + myself->configEpoch = 0; + serverLog(LL_NOTICE, "configEpoch set to 0 via CLUSTER RESET HARD"); + + /* To change the Node ID we need to remove the old name from the + * nodes table, change the ID, and re-add back with new name. */ + oldname = sdsnewlen(myself->name, CLUSTER_NAMELEN); + dictDelete(server.cluster->nodes,oldname); + sdsfree(oldname); + getRandomHexChars(myself->name, CLUSTER_NAMELEN); + getRandomHexChars(myself->shard_id, CLUSTER_NAMELEN); + clusterAddNode(myself); + serverLog(LL_NOTICE,"Node hard reset, now I'm %.40s", myself->name); + } + + /* Re-populate shards */ + clusterAddNodeToShard(myself->shard_id, myself); + + /* Make sure to persist the new config and update the state. 
*/ + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_FSYNC_CONFIG); +} + +/* ----------------------------------------------------------------------------- + * CLUSTER communication link + * -------------------------------------------------------------------------- */ +static clusterMsgSendBlock *createClusterMsgSendBlock(int type, uint32_t msglen) { + uint32_t blocklen = msglen + sizeof(clusterMsgSendBlock) - sizeof(clusterMsg); + clusterMsgSendBlock *msgblock = zcalloc(blocklen); + msgblock->refcount = 1; + msgblock->totlen = blocklen; + server.stat_cluster_links_memory += blocklen; + clusterBuildMessageHdr(&msgblock->msg,type,msglen); + return msgblock; +} + +static void clusterMsgSendBlockDecrRefCount(void *node) { + clusterMsgSendBlock *msgblock = (clusterMsgSendBlock*)node; + msgblock->refcount--; + serverAssert(msgblock->refcount >= 0); + if (msgblock->refcount == 0) { + server.stat_cluster_links_memory -= msgblock->totlen; + zfree(msgblock); + } +} + +clusterLink *createClusterLink(clusterNode *node) { + clusterLink *link = zmalloc(sizeof(*link)); + link->ctime = mstime(); + link->send_msg_queue = listCreate(); + listSetFreeMethod(link->send_msg_queue, clusterMsgSendBlockDecrRefCount); + link->head_msg_send_offset = 0; + link->send_msg_queue_mem = sizeof(list); + link->rcvbuf = zmalloc(link->rcvbuf_alloc = RCVBUF_INIT_LEN); + link->rcvbuf_len = 0; + server.stat_cluster_links_memory += link->rcvbuf_alloc + link->send_msg_queue_mem; + link->conn = NULL; + link->node = node; + /* Related node can only possibly be known at link creation time if this is an outbound link */ + link->inbound = (node == NULL); + if (!link->inbound) { + node->link = link; + } + return link; +} + +/* Free a cluster link, but does not free the associated node of course. + * This function will just make sure that the original node associated + * with this link will have the 'link' field set to NULL. 
*/ +void freeClusterLink(clusterLink *link) { + if (link->conn) { + connClose(link->conn); + link->conn = NULL; + } + server.stat_cluster_links_memory -= sizeof(list) + listLength(link->send_msg_queue)*sizeof(listNode); + listRelease(link->send_msg_queue); + server.stat_cluster_links_memory -= link->rcvbuf_alloc; + zfree(link->rcvbuf); + if (link->node) { + if (link->node->link == link) { + serverAssert(!link->inbound); + link->node->link = NULL; + } else if (link->node->inbound_link == link) { + serverAssert(link->inbound); + link->node->inbound_link = NULL; + } + } + zfree(link); +} + +void setClusterNodeToInboundClusterLink(clusterNode *node, clusterLink *link) { + serverAssert(!link->node); + serverAssert(link->inbound); + if (node->inbound_link) { + /* A peer may disconnect and then reconnect with us, and it's not guaranteed that + * we would always process the disconnection of the existing inbound link before + * accepting a new existing inbound link. Therefore, it's possible to have more than + * one inbound link from the same node at the same time. Our cleanup logic assumes + * a one to one relationship between nodes and inbound links, so we need to kill + * one of the links. The existing link is more likely the outdated one, but it's + * possible the other node may need to open another link. */ + serverLog(LL_DEBUG, "Replacing inbound link fd %d from node %.40s with fd %d", + node->inbound_link->conn->fd, node->name, link->conn->fd); + freeClusterLink(node->inbound_link); + } + serverAssert(!node->inbound_link); + node->inbound_link = link; + link->node = node; +} + +static void clusterConnAcceptHandler(connection *conn) { + clusterLink *link; + + if (connGetState(conn) != CONN_STATE_CONNECTED) { + serverLog(LL_VERBOSE, + "Error accepting cluster node connection: %s", connGetLastError(conn)); + connClose(conn); + return; + } + + /* Create a link object we use to handle the connection. + * It gets passed to the readable handler when data is available. 
 + * Initially the link->node pointer is set to NULL as we don't know + * which node it is, but the right node is referenced once we know the + * node identity. */ + link = createClusterLink(NULL); + link->conn = conn; + connSetPrivateData(conn, link); + + /* Register read handler */ + connSetReadHandler(conn, clusterReadHandler); +} + +#define MAX_CLUSTER_ACCEPTS_PER_CALL 1000 +void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { + int cport, cfd; + int max = MAX_CLUSTER_ACCEPTS_PER_CALL; + char cip[NET_IP_STR_LEN]; + int require_auth = TLS_CLIENT_AUTH_YES; + UNUSED(el); + UNUSED(mask); + UNUSED(privdata); + + /* If the server is starting up, don't accept cluster connections: + * UPDATE messages may interact with the database content. */ + if (server.masterhost == NULL && server.loading) return; + + while(max--) { + cfd = anetTcpAccept(server.neterr, fd, cip, sizeof(cip), &cport); + if (cfd == ANET_ERR) { + if (errno != EWOULDBLOCK) + serverLog(LL_VERBOSE, + "Error accepting cluster node: %s", server.neterr); + return; + } + + connection *conn = connCreateAccepted(connTypeOfCluster(), cfd, &require_auth); + + /* Make sure connection is not in an error state */ + if (connGetState(conn) != CONN_STATE_ACCEPTING) { + serverLog(LL_VERBOSE, + "Error creating an accepting connection for cluster node: %s", + connGetLastError(conn)); + connClose(conn); + return; + } + connEnableTcpNoDelay(conn); + connKeepAlive(conn,server.cluster_node_timeout / 1000 * 2); + + /* Use non-blocking I/O for cluster messages. */ + serverLog(LL_VERBOSE,"Accepting cluster node connection from %s:%d", cip, cport); + + /* Accept the connection now. connAccept() may call our handler directly + * or schedule it for later depending on connection implementation. 
+ */ + if (connAccept(conn, clusterConnAcceptHandler) == C_ERR) { + if (connGetState(conn) == CONN_STATE_ERROR) + serverLog(LL_VERBOSE, + "Error accepting cluster node connection: %s", + connGetLastError(conn)); + connClose(conn); + return; + } + } +} + +/* Return the approximated number of sockets we are using in order to + * take the cluster bus connections. */ +unsigned long getClusterConnectionsCount(void) { + /* We decrement the number of nodes by one, since there is the + * "myself" node too in the list. Each node uses two file descriptors, + * one incoming and one outgoing, thus the multiplication by 2. */ + return server.cluster_enabled ? + ((dictSize(server.cluster->nodes)-1)*2) : 0; +} + +/* ----------------------------------------------------------------------------- + * CLUSTER node API + * -------------------------------------------------------------------------- */ + +/* Create a new cluster node, with the specified flags. + * If "nodename" is NULL this is considered a first handshake and a random + * node name is assigned to this node (it will be fixed later when we'll + * receive the first pong). + * + * The node is created and returned to the user, but it is not automatically + * added to the nodes hash table. 
*/ +clusterNode *createClusterNode(char *nodename, int flags) { + clusterNode *node = zmalloc(sizeof(*node)); + + if (nodename) + memcpy(node->name, nodename, CLUSTER_NAMELEN); + else + getRandomHexChars(node->name, CLUSTER_NAMELEN); + getRandomHexChars(node->shard_id, CLUSTER_NAMELEN); + node->ctime = mstime(); + node->configEpoch = 0; + node->flags = flags; + memset(node->slots,0,sizeof(node->slots)); + node->slot_info_pairs = NULL; + node->slot_info_pairs_count = 0; + node->numslots = 0; + node->numslaves = 0; + node->slaves = NULL; + node->slaveof = NULL; + node->last_in_ping_gossip = 0; + node->ping_sent = node->pong_received = 0; + node->data_received = 0; + node->fail_time = 0; + node->link = NULL; + node->inbound_link = NULL; + memset(node->ip,0,sizeof(node->ip)); + node->hostname = sdsempty(); + node->human_nodename = sdsempty(); + node->tcp_port = 0; + node->cport = 0; + node->tls_port = 0; + node->fail_reports = listCreate(); + node->voted_time = 0; + node->orphaned_time = 0; + node->repl_offset_time = 0; + node->repl_offset = 0; + listSetFreeMethod(node->fail_reports,zfree); + return node; +} + +/* This function is called every time we get a failure report from a node. + * The side effect is to populate the fail_reports list (or to update + * the timestamp of an existing report). + * + * 'failing' is the node that is in failure state according to the + * 'sender' node. + * + * The function returns 0 if it just updates a timestamp of an existing + * failure report from the same sender. 1 is returned if a new failure + * report is created. */ +int clusterNodeAddFailureReport(clusterNode *failing, clusterNode *sender) { + list *l = failing->fail_reports; + listNode *ln; + listIter li; + clusterNodeFailReport *fr; + + /* If a failure report from the same sender already exists, just update + * the timestamp. 
*/ + listRewind(l,&li); + while ((ln = listNext(&li)) != NULL) { + fr = ln->value; + if (fr->node == sender) { + fr->time = mstime(); + return 0; + } + } + + /* Otherwise create a new report. */ + fr = zmalloc(sizeof(*fr)); + fr->node = sender; + fr->time = mstime(); + listAddNodeTail(l,fr); + return 1; +} + +/* Remove failure reports that are too old, where too old means reasonably + * older than the global node timeout. Note that anyway for a node to be + * flagged as FAIL we need to have a local PFAIL state that is at least + * older than the global node timeout, so we don't just trust the number + * of failure reports from other nodes. */ +void clusterNodeCleanupFailureReports(clusterNode *node) { + list *l = node->fail_reports; + listNode *ln; + listIter li; + clusterNodeFailReport *fr; + mstime_t maxtime = server.cluster_node_timeout * + CLUSTER_FAIL_REPORT_VALIDITY_MULT; + mstime_t now = mstime(); + + listRewind(l,&li); + while ((ln = listNext(&li)) != NULL) { + fr = ln->value; + if (now - fr->time > maxtime) listDelNode(l,ln); + } +} + +/* Remove the failing report for 'node' if it was previously considered + * failing by 'sender'. This function is called when a node informs us via + * gossip that a node is OK from its point of view (no FAIL or PFAIL flags). + * + * Note that this function is called relatively often as it gets called even + * when there are no nodes failing, and is O(N), however when the cluster is + * fine the failure reports list is empty so the function runs in constant + * time. + * + * The function returns 1 if the failure report was found and removed. + * Otherwise 0 is returned. */ +int clusterNodeDelFailureReport(clusterNode *node, clusterNode *sender) { + list *l = node->fail_reports; + listNode *ln; + listIter li; + clusterNodeFailReport *fr; + + /* Search for a failure report from this sender. 
 */
    listRewind(l,&li);
    while ((ln = listNext(&li)) != NULL) {
        fr = ln->value;
        if (fr->node == sender) break;
    }
    if (!ln) return 0; /* No failure report from this sender. */

    /* Remove the failure report. */
    listDelNode(l,ln);
    clusterNodeCleanupFailureReports(node);
    return 1;
}

/* Return the number of external nodes that believe 'node' is failing,
 * not including this node, that may have a PFAIL or FAIL state for this
 * node as well. */
int clusterNodeFailureReportsCount(clusterNode *node) {
    /* Purge expired reports first so only fresh ones are counted. */
    clusterNodeCleanupFailureReports(node);
    return listLength(node->fail_reports);
}

/* Remove 'slave' from the slaves array of 'master'.
 * Return C_OK if it was found and removed, C_ERR if it was not a
 * registered slave of that master. */
int clusterNodeRemoveSlave(clusterNode *master, clusterNode *slave) {
    int j;

    for (j = 0; j < master->numslaves; j++) {
        if (master->slaves[j] == slave) {
            /* Shift the tail of the array over the removed entry. */
            if ((j+1) < master->numslaves) {
                int remaining_slaves = (master->numslaves - j) - 1;
                memmove(master->slaves+j,master->slaves+(j+1),
                    (sizeof(*master->slaves) * remaining_slaves));
            }
            master->numslaves--;
            /* A master left without slaves is no longer a valid target
             * for replica migration. */
            if (master->numslaves == 0)
                master->flags &= ~CLUSTER_NODE_MIGRATE_TO;
            return C_OK;
        }
    }
    return C_ERR;
}

/* Add 'slave' to the slaves array of 'master'.
 * Return C_OK on success, C_ERR if the slave was already listed. */
int clusterNodeAddSlave(clusterNode *master, clusterNode *slave) {
    int j;

    /* If it's already a slave, don't add it again. */
    for (j = 0; j < master->numslaves; j++)
        if (master->slaves[j] == slave) return C_ERR;
    master->slaves = zrealloc(master->slaves,
        sizeof(clusterNode*)*(master->numslaves+1));
    master->slaves[master->numslaves] = slave;
    master->numslaves++;
    master->flags |= CLUSTER_NODE_MIGRATE_TO;
    return C_OK;
}

/* Return how many slaves of 'n' are not currently flagged FAIL. */
int clusterCountNonFailingSlaves(clusterNode *n) {
    int j, okslaves = 0;

    for (j = 0; j < n->numslaves; j++)
        if (!nodeFailed(n->slaves[j])) okslaves++;
    return okslaves;
}

/* Low level cleanup of the node structure. Only called by clusterDelNode(). */
void freeClusterNode(clusterNode *n) {
    sds nodename;
    int j;

    /* If the node has associated slaves, we have to set
     * all the slaves->slaveof fields to NULL (unknown). */
    for (j = 0; j < n->numslaves; j++)
        n->slaves[j]->slaveof = NULL;

    /* Remove this node from the list of slaves of its master. */
    if (nodeIsSlave(n) && n->slaveof) clusterNodeRemoveSlave(n->slaveof,n);

    /* Unlink from the set of nodes. */
    nodename = sdsnewlen(n->name, CLUSTER_NAMELEN);
    serverAssert(dictDelete(server.cluster->nodes,nodename) == DICT_OK);
    sdsfree(nodename);
    sdsfree(n->hostname);
    sdsfree(n->human_nodename);

    /* Release links and associated data structures. */
    if (n->link) freeClusterLink(n->link);
    if (n->inbound_link) freeClusterLink(n->inbound_link);
    listRelease(n->fail_reports);
    zfree(n->slaves);
    zfree(n);
}

/* Add a node to the nodes hash table. The dict takes ownership of the
 * sds key created here; the node pointer is stored as the value. */
void clusterAddNode(clusterNode *node) {
    int retval;

    retval = dictAdd(server.cluster->nodes,
            sdsnewlen(node->name,CLUSTER_NAMELEN), node);
    serverAssert(retval == DICT_OK);
}

/* Remove a node from the cluster. The function performs the high level
 * cleanup, calling freeClusterNode() for the low level cleanup.
 * Here we do the following:
 *
 * 1) Mark all the slots handled by it as unassigned.
 * 2) Remove all the failure reports sent by this node and referenced by
 *    other nodes.
 * 3) Remove the node from the owning shard
 * 4) Free the node with freeClusterNode() that will in turn remove it
 *    from the hash table and from the list of slaves of its master, if
 *    it is a slave node.
 */
void clusterDelNode(clusterNode *delnode) {
    int j;
    dictIterator *di;
    dictEntry *de;

    /* 1) Mark slots as unassigned. */
    for (j = 0; j < CLUSTER_SLOTS; j++) {
        if (server.cluster->importing_slots_from[j] == delnode)
            server.cluster->importing_slots_from[j] = NULL;
        if (server.cluster->migrating_slots_to[j] == delnode)
            server.cluster->migrating_slots_to[j] = NULL;
        if (server.cluster->slots[j] == delnode)
            clusterDelSlot(j);
    }

    /* 2) Remove failure reports. */
    di = dictGetSafeIterator(server.cluster->nodes);
    while((de = dictNext(di)) != NULL) {
        clusterNode *node = dictGetVal(de);

        if (node == delnode) continue;
        clusterNodeDelFailureReport(node,delnode);
    }
    dictReleaseIterator(di);

    /* 3) Remove the node from the owning shard */
    clusterRemoveNodeFromShard(delnode);

    /* 4) Free the node, unlinking it from the cluster. */
    freeClusterNode(delnode);
}

/* Node lookup by name. Returns NULL if 'name' is not a syntactically
 * valid node ID or if no node with that ID is known. */
clusterNode *clusterLookupNode(const char *name, int length) {
    if (verifyClusterNodeId(name, length) != C_OK) return NULL;
    sds s = sdsnewlen(name, length);
    dictEntry *de = dictFind(server.cluster->nodes, s);
    sdsfree(s);
    if (de == NULL) return NULL;
    return dictGetVal(de);
}

/* Get all the nodes in my shard.
 * Note that the list returned is not computed on the fly
 * via slaveof; rather, it is maintained permanently to
 * track the shard membership and its life cycle is tied
 * to this Redis process. Therefore, the caller must not
 * release the list. */
list *clusterGetNodesInMyShard(clusterNode *node) {
    sds s = sdsnewlen(node->shard_id, CLUSTER_NAMELEN);
    dictEntry *de = dictFind(server.cluster->shards,s);
    sdsfree(s);
    return (de != NULL) ? dictGetVal(de) : NULL;
}

/* This is only used after the handshake. When we connect a given IP/PORT
 * as a result of CLUSTER MEET we don't have the node name yet, so we
 * pick a random one, and will fix it when we receive the PONG request using
 * this function.
 */
void clusterRenameNode(clusterNode *node, char *newname) {
    int retval;
    sds s = sdsnewlen(node->name, CLUSTER_NAMELEN);

    serverLog(LL_DEBUG,"Renaming node %.40s into %.40s",
        node->name, newname);
    /* Re-key the nodes dict: delete under the old name, then re-add
     * after updating node->name in place. */
    retval = dictDelete(server.cluster->nodes, s);
    sdsfree(s);
    serverAssert(retval == DICT_OK);
    memcpy(node->name, newname, CLUSTER_NAMELEN);
    clusterAddNode(node);
}

/* Add 'node' to the list of nodes of the shard identified by 'shard_id'.
 * The shard entry (and its list) is created on first use; adding the same
 * node twice is a no-op. */
void clusterAddNodeToShard(const char *shard_id, clusterNode *node) {
    sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN);
    dictEntry *de = dictFind(server.cluster->shards,s);
    if (de == NULL) {
        list *l = listCreate();
        listAddNodeTail(l, node);
        /* On success the shards dict takes ownership of 's'. */
        serverAssert(dictAdd(server.cluster->shards, s, l) == DICT_OK);
    } else {
        list *l = dictGetVal(de);
        if (listSearchKey(l, node) == NULL) {
            listAddNodeTail(l, node);
        }
        sdsfree(s);
    }
}

/* Remove 'node' from its owning shard's node list. When the list becomes
 * empty, the shard entry itself is deleted from the shards dict. */
void clusterRemoveNodeFromShard(clusterNode *node) {
    sds s = sdsnewlen(node->shard_id, CLUSTER_NAMELEN);
    dictEntry *de = dictFind(server.cluster->shards, s);
    if (de != NULL) {
        list *l = dictGetVal(de);
        listNode *ln = listSearchKey(l, node);
        if (ln != NULL) {
            listDelNode(l, ln);
        }
        if (listLength(l) == 0) {
            dictDelete(server.cluster->shards, s);
        }
    }
    sdsfree(s);
}

/* -----------------------------------------------------------------------------
 * CLUSTER config epoch handling
 * -------------------------------------------------------------------------- */

/* Return the greatest configEpoch found in the cluster, or the current
 * epoch if greater than any node configEpoch.
 */
uint64_t clusterGetMaxEpoch(void) {
    uint64_t max = 0;
    dictIterator *di;
    dictEntry *de;

    /* Scan every known node for the largest configEpoch. */
    di = dictGetSafeIterator(server.cluster->nodes);
    while((de = dictNext(di)) != NULL) {
        clusterNode *node = dictGetVal(de);
        if (node->configEpoch > max) max = node->configEpoch;
    }
    dictReleaseIterator(di);
    /* The cluster-wide currentEpoch may exceed any single node's epoch. */
    if (max < server.cluster->currentEpoch) max = server.cluster->currentEpoch;
    return max;
}

/* If this node epoch is zero or is not already the greatest across the
 * cluster (from the POV of the local configuration), this function will:
 *
 * 1) Generate a new config epoch, incrementing the current epoch.
 * 2) Assign the new epoch to this node, WITHOUT any consensus.
 * 3) Persist the configuration on disk before sending packets with the
 *    new configuration.
 *
 * If the new config epoch is generated and assigned, C_OK is returned,
 * otherwise C_ERR is returned (since the node has already the greatest
 * configuration around) and no operation is performed.
 *
 * Important note: this function violates the principle that config epochs
 * should be generated with consensus and should be unique across the cluster.
 * However Redis Cluster uses this auto-generated new config epochs in two
 * cases:
 *
 * 1) When slots are closed after importing. Otherwise resharding would be
 *    too expensive.
 * 2) When CLUSTER FAILOVER is called with options that force a slave to
 *    failover its master even if there is not master majority able to
 *    create a new configuration epoch.
 *
 * Redis Cluster will not explode using this function, even in the case of
 * a collision between this node and another node, generating the same
 * configuration epoch unilaterally, because the config epoch conflict
 * resolution algorithm will eventually move colliding nodes to different
 * config epochs. However using this function may violate the "last failover
 * wins" rule, so should only be used with care.
 */
int clusterBumpConfigEpochWithoutConsensus(void) {
    uint64_t maxEpoch = clusterGetMaxEpoch();

    if (myself->configEpoch == 0 ||
        myself->configEpoch != maxEpoch)
    {
        /* Take the next epoch and schedule a config save + fsync before
         * any packet with the new configuration is sent. */
        server.cluster->currentEpoch++;
        myself->configEpoch = server.cluster->currentEpoch;
        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                             CLUSTER_TODO_FSYNC_CONFIG);
        serverLog(LL_NOTICE,
            "New configEpoch set to %llu",
            (unsigned long long) myself->configEpoch);
        return C_OK;
    } else {
        return C_ERR;
    }
}

/* This function is called when this node is a master, and we receive from
 * another master a configuration epoch that is equal to our configuration
 * epoch.
 *
 * BACKGROUND
 *
 * It is not possible that different slaves get the same config
 * epoch during a failover election, because the slaves need to get voted
 * by a majority. However when we perform a manual resharding of the cluster
 * the node will assign a configuration epoch to itself without to ask
 * for agreement. Usually resharding happens when the cluster is working well
 * and is supervised by the sysadmin, however it is possible for a failover
 * to happen exactly while the node we are resharding a slot to assigns itself
 * a new configuration epoch, but before it is able to propagate it.
 *
 * So technically it is possible in this condition that two nodes end with
 * the same configuration epoch.
 *
 * Another possibility is that there are bugs in the implementation causing
 * this to happen.
 *
 * Moreover when a new cluster is created, all the nodes start with the same
 * configEpoch. This collision resolution code allows nodes to automatically
 * end with a different configEpoch at startup automatically.
 *
 * In all the cases, we want a mechanism that resolves this issue automatically
 * as a safeguard. The same configuration epoch for masters serving different
 * set of slots is not harmful, but it is if the nodes end serving the same
 * slots for some reason (manual errors or software bugs) without a proper
 * failover procedure.
 *
 * In general we want a system that eventually always ends with different
 * masters having different configuration epochs whatever happened, since
 * nothing is worse than a split-brain condition in a distributed system.
 *
 * BEHAVIOR
 *
 * When this function gets called, what happens is that if this node
 * has the lexicographically smaller Node ID compared to the other node
 * with the conflicting epoch (the 'sender' node), it will assign itself
 * the greatest configuration epoch currently detected among nodes plus 1.
 *
 * This means that even if there are multiple nodes colliding, the node
 * with the greatest Node ID never moves forward, so eventually all the nodes
 * end with a different configuration epoch.
 */
void clusterHandleConfigEpochCollision(clusterNode *sender) {
    /* Prerequisites: nodes have the same configEpoch and are both masters. */
    if (sender->configEpoch != myself->configEpoch ||
        !clusterNodeIsMaster(sender) || !clusterNodeIsMaster(myself)) return;
    /* Don't act if the colliding node has a smaller Node ID. */
    if (memcmp(sender->name,myself->name,CLUSTER_NAMELEN) <= 0) return;
    /* Get the next ID available at the best of this node knowledge. */
    server.cluster->currentEpoch++;
    myself->configEpoch = server.cluster->currentEpoch;
    clusterSaveConfigOrDie(1);
    serverLog(LL_VERBOSE,
        "WARNING: configEpoch collision with node %.40s (%s)."
        " configEpoch set to %llu",
        sender->name,sender->human_nodename,
        (unsigned long long) myself->configEpoch);
}

/* -----------------------------------------------------------------------------
 * CLUSTER nodes blacklist
 *
 * The nodes blacklist is just a way to ensure that a given node with a given
 * Node ID is not re-added before some time elapsed (this time is specified
 * in seconds in CLUSTER_BLACKLIST_TTL).
 *
 * This is useful when we want to remove a node from the cluster completely:
 * when CLUSTER FORGET is called, it also puts the node into the blacklist so
 * that even if we receive gossip messages from other nodes that still remember
 * about the node we want to remove, we don't re-add it before some time.
 *
 * Currently the CLUSTER_BLACKLIST_TTL is set to 1 minute, this means
 * that redis-cli has 60 seconds to send CLUSTER FORGET messages to nodes
 * in the cluster without dealing with the problem of other nodes re-adding
 * back the node to nodes we already sent the FORGET command to.
 *
 * The data structure used is a hash table with an sds string representing
 * the node ID as key, and the time when it is ok to re-add the node as
 * value.
 * -------------------------------------------------------------------------- */

#define CLUSTER_BLACKLIST_TTL 60      /* 1 minute. */


/* Before the addNode() or Exists() operations we always remove expired
 * entries from the black list. This is an O(N) operation but it is not a
 * problem since add / exists operations are called very infrequently and
 * the hash table is supposed to contain very little elements at max.
 * However without the cleanup during long uptime and with some automated
 * node add/removal procedures, entries could accumulate.
 */
void clusterBlacklistCleanup(void) {
    dictIterator *di;
    dictEntry *de;

    di = dictGetSafeIterator(server.cluster->nodes_black_list);
    while((de = dictNext(di)) != NULL) {
        /* NOTE(review): the stored value is set via
         * dictSetUnsignedIntegerVal() but read back as int64_t here;
         * values stay far below INT64_MAX (unixtime + TTL) so the
         * signed/unsigned mismatch is harmless in practice. */
        int64_t expire = dictGetUnsignedIntegerVal(de);

        if (expire < server.unixtime)
            dictDelete(server.cluster->nodes_black_list,dictGetKey(de));
    }
    dictReleaseIterator(di);
}

/* Cleanup the blacklist and add a new node ID to the black list. */
void clusterBlacklistAddNode(clusterNode *node) {
    dictEntry *de;
    sds id = sdsnewlen(node->name,CLUSTER_NAMELEN);

    clusterBlacklistCleanup();
    if (dictAdd(server.cluster->nodes_black_list,id,NULL) == DICT_OK) {
        /* If the key was added, duplicate the sds string representation of
         * the key for the next lookup. We'll free it at the end. */
        id = sdsdup(id);
    }
    /* Refresh the expiry whether the entry was just added or existed. */
    de = dictFind(server.cluster->nodes_black_list,id);
    dictSetUnsignedIntegerVal(de,time(NULL)+CLUSTER_BLACKLIST_TTL);
    sdsfree(id);
}

/* Return non-zero if the specified node ID exists in the blacklist.
 * You don't need to pass an sds string here, any pointer to 40 bytes
 * will work. */
int clusterBlacklistExists(char *nodeid) {
    sds id = sdsnewlen(nodeid,CLUSTER_NAMELEN);
    int retval;

    clusterBlacklistCleanup();
    retval = dictFind(server.cluster->nodes_black_list,id) != NULL;
    sdsfree(id);
    return retval;
}

/* -----------------------------------------------------------------------------
 * CLUSTER messages exchange - PING/PONG and gossip
 * -------------------------------------------------------------------------- */

/* This function checks if a given node should be marked as FAIL.
 * It happens if the following conditions are met:
 *
 * 1) We received enough failure reports from other master nodes via gossip.
 *    Enough means that the majority of the masters signaled the node is
 *    down recently.
 * 2) We believe this node is in PFAIL state.
 *
 * If a failure is detected we also inform the whole cluster about this
 * event trying to force every other node to set the FAIL flag for the node.
 *
 * Note that the form of agreement used here is weak, as we collect the majority
 * of masters state during some time, and even if we force agreement by
 * propagating the FAIL message, because of partitions we may not reach every
 * node. However:
 *
 * 1) Either we reach the majority and eventually the FAIL state will propagate
 *    to all the cluster.
 * 2) Or there is no majority so no slave promotion will be authorized and the
 *    FAIL flag will be cleared after some time.
 */
void markNodeAsFailingIfNeeded(clusterNode *node) {
    int failures;
    /* Majority of masters: floor(size/2) + 1. */
    int needed_quorum = (server.cluster->size / 2) + 1;

    if (!nodeTimedOut(node)) return; /* We can reach it. */
    if (nodeFailed(node)) return; /* Already FAILing. */

    failures = clusterNodeFailureReportsCount(node);
    /* Also count myself as a voter if I'm a master. */
    if (clusterNodeIsMaster(myself)) failures++;
    if (failures < needed_quorum) return; /* No weak agreement from masters. */

    serverLog(LL_NOTICE,
        "Marking node %.40s (%s) as failing (quorum reached).", node->name, node->human_nodename);

    /* Mark the node as failing: PFAIL is promoted to FAIL. */
    node->flags &= ~CLUSTER_NODE_PFAIL;
    node->flags |= CLUSTER_NODE_FAIL;
    node->fail_time = mstime();

    /* Broadcast the failing node name to everybody, forcing all the other
     * reachable nodes to flag the node as FAIL.
     * We do that even if this node is a replica and not a master: anyway
     * the failing state is triggered collecting failure reports from masters,
     * so here the replica is only helping propagating this status. */
    clusterSendFail(node->name);
    clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
}

/* This function is called only if a node is marked as FAIL, but we are able
 * to reach it again. It checks if there are the conditions to undo the FAIL
 * state.
 */
void clearNodeFailureIfNeeded(clusterNode *node) {
    mstime_t now = mstime();

    serverAssert(nodeFailed(node));

    /* For slaves we always clear the FAIL flag if we can contact the
     * node again. The same applies to masters serving no slots. */
    if (nodeIsSlave(node) || node->numslots == 0) {
        serverLog(LL_NOTICE,
            "Clear FAIL state for node %.40s (%s):%s is reachable again.",
                node->name,node->human_nodename,
                nodeIsSlave(node) ? "replica" : "master without slots");
        node->flags &= ~CLUSTER_NODE_FAIL;
        clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
    }

    /* If it is a master and...
     * 1) The FAIL state is old enough.
     * 2) It is yet serving slots from our point of view (not failed over).
     * Apparently no one is going to fix these slots, clear the FAIL flag. */
    if (clusterNodeIsMaster(node) && node->numslots > 0 &&
        (now - node->fail_time) >
        (server.cluster_node_timeout * CLUSTER_FAIL_UNDO_TIME_MULT))
    {
        serverLog(LL_NOTICE,
            "Clear FAIL state for node %.40s (%s): is reachable again and nobody is serving its slots after some time.",
                node->name, node->human_nodename);
        node->flags &= ~CLUSTER_NODE_FAIL;
        clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
    }
}

/* Return true if we already have a node in HANDSHAKE state matching the
 * specified ip address and port number. This function is used in order to
 * avoid adding a new handshake node for the same address multiple times. */
int clusterHandshakeInProgress(char *ip, int port, int cport) {
    dictIterator *di;
    dictEntry *de;

    di = dictGetSafeIterator(server.cluster->nodes);
    while((de = dictNext(di)) != NULL) {
        clusterNode *node = dictGetVal(de);

        if (!nodeInHandshake(node)) continue;
        if (!strcasecmp(node->ip,ip) &&
            getNodeDefaultClientPort(node) == port &&
            node->cport == cport) break;
    }
    dictReleaseIterator(di);
    /* 'de' is non-NULL only if the loop broke on a match. */
    return de != NULL;
}

/* Start a handshake with the specified address if there is not one
 * already in progress.
 * Returns non-zero if the handshake was actually
 * started. On error zero is returned and errno is set to one of the
 * following values:
 *
 * EAGAIN - There is already a handshake in progress for this address.
 * EINVAL - IP or port are not valid. */
int clusterStartHandshake(char *ip, int port, int cport) {
    clusterNode *n;
    char norm_ip[NET_IP_STR_LEN];
    struct sockaddr_storage sa;

    /* IP sanity check */
    if (inet_pton(AF_INET,ip,
            &(((struct sockaddr_in *)&sa)->sin_addr)))
    {
        sa.ss_family = AF_INET;
    } else if (inet_pton(AF_INET6,ip,
            &(((struct sockaddr_in6 *)&sa)->sin6_addr)))
    {
        sa.ss_family = AF_INET6;
    } else {
        errno = EINVAL;
        return 0;
    }

    /* Port sanity check */
    if (port <= 0 || port > 65535 || cport <= 0 || cport > 65535) {
        errno = EINVAL;
        return 0;
    }

    /* Set norm_ip as the normalized string representation of the node
     * IP address. */
    memset(norm_ip,0,NET_IP_STR_LEN);
    if (sa.ss_family == AF_INET)
        inet_ntop(AF_INET,
            (void*)&(((struct sockaddr_in *)&sa)->sin_addr),
            norm_ip,NET_IP_STR_LEN);
    else
        inet_ntop(AF_INET6,
            (void*)&(((struct sockaddr_in6 *)&sa)->sin6_addr),
            norm_ip,NET_IP_STR_LEN);

    if (clusterHandshakeInProgress(norm_ip,port,cport)) {
        errno = EAGAIN;
        return 0;
    }

    /* Add the node with a random address (NULL as first argument to
     * createClusterNode()). Everything will be fixed during the
     * handshake. */
    n = createClusterNode(NULL,CLUSTER_NODE_HANDSHAKE|CLUSTER_NODE_MEET);
    memcpy(n->ip,norm_ip,sizeof(n->ip));
    if (server.tls_cluster) {
        n->tls_port = port;
    } else {
        n->tcp_port = port;
    }
    n->cport = cport;
    clusterAddNode(n);
    return 1;
}

/* Extract the TCP and TLS client ports from a cluster message header.
 * 'port' always carries the primary (cluster-bus-preferred) port and
 * 'pport' the alternate one; which is TCP vs TLS depends on whether
 * this node runs the cluster bus over TLS. */
static void getClientPortFromClusterMsg(clusterMsg *hdr, int *tls_port, int *tcp_port) {
    if (server.tls_cluster) {
        *tls_port = ntohs(hdr->port);
        *tcp_port = ntohs(hdr->pport);
    } else {
        *tls_port = ntohs(hdr->pport);
        *tcp_port = ntohs(hdr->port);
    }
}

/* Same as getClientPortFromClusterMsg() but for a gossip entry. */
static void getClientPortFromGossip(clusterMsgDataGossip *g, int *tls_port, int *tcp_port) {
    if (server.tls_cluster) {
        *tls_port = ntohs(g->port);
        *tcp_port = ntohs(g->pport);
    } else {
        *tls_port = ntohs(g->pport);
        *tcp_port = ntohs(g->port);
    }
}

/* Process the gossip section of PING or PONG packets.
 * Note that this function assumes that the packet is already sanity-checked
 * by the caller, not in the content of the gossip section, but in the
 * length. */
void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) {
    uint16_t count = ntohs(hdr->count);
    clusterMsgDataGossip *g = (clusterMsgDataGossip*) hdr->data.ping.gossip;
    clusterNode *sender = link->node ? link->node : clusterLookupNode(hdr->sender, CLUSTER_NAMELEN);

    while(count--) {
        uint16_t flags = ntohs(g->flags);
        clusterNode *node;
        sds ci;

        if (server.verbosity == LL_DEBUG) {
            ci = representClusterNodeFlags(sdsempty(), flags);
            serverLog(LL_DEBUG,"GOSSIP %.40s %s:%d@%d %s",
                g->nodename,
                g->ip,
                ntohs(g->port),
                ntohs(g->cport),
                ci);
            sdsfree(ci);
        }

        /* Convert port and pport into TCP port and TLS port. */
        int msg_tls_port, msg_tcp_port;
        getClientPortFromGossip(g, &msg_tls_port, &msg_tcp_port);

        /* Update our state accordingly to the gossip sections */
        node = clusterLookupNode(g->nodename, CLUSTER_NAMELEN);
        if (node) {
            /* We already know this node.
               Handle failure reports, only when the sender is a master. */
            if (sender && clusterNodeIsMaster(sender) && node != myself) {
                if (flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) {
                    if (clusterNodeAddFailureReport(node,sender)) {
                        serverLog(LL_VERBOSE,
                            "Node %.40s (%s) reported node %.40s (%s) as not reachable.",
                            sender->name, sender->human_nodename, node->name, node->human_nodename);
                    }
                    markNodeAsFailingIfNeeded(node);
                } else {
                    if (clusterNodeDelFailureReport(node,sender)) {
                        serverLog(LL_VERBOSE,
                            "Node %.40s (%s) reported node %.40s (%s) is back online.",
                            sender->name, sender->human_nodename, node->name, node->human_nodename);
                    }
                }
            }

            /* If from our POV the node is up (no failure flags are set),
             * we have no pending ping for the node, nor we have failure
             * reports for this node, update the last pong time with the
             * one we see from the other nodes. */
            if (!(flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) &&
                node->ping_sent == 0 &&
                clusterNodeFailureReportsCount(node) == 0)
            {
                /* Gossip carries pong time in seconds (32 bit). */
                mstime_t pongtime = ntohl(g->pong_received);
                pongtime *= 1000; /* Convert back to milliseconds. */

                /* Replace the pong time with the received one only if
                 * it's greater than our view but is not in the future
                 * (with 500 milliseconds tolerance) from the POV of our
                 * clock. */
                if (pongtime <= (server.mstime+500) &&
                    pongtime > node->pong_received)
                {
                    node->pong_received = pongtime;
                }
            }

            /* If we already know this node, but it is not reachable, and
             * we see a different address in the gossip section of a node that
             * can talk with this other node, update the address, disconnect
             * the old link if any, so that we'll attempt to connect with the
             * new address. */
            if (node->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL) &&
                !(flags & CLUSTER_NODE_NOADDR) &&
                !(flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) &&
                (strcasecmp(node->ip,g->ip) ||
                 node->tls_port != (server.tls_cluster ? ntohs(g->port) : ntohs(g->pport)) ||
                 node->tcp_port != (server.tls_cluster ? ntohs(g->pport) : ntohs(g->port)) ||
                 node->cport != ntohs(g->cport)))
            {
                if (node->link) freeClusterLink(node->link);
                memcpy(node->ip,g->ip,NET_IP_STR_LEN);
                node->tcp_port = msg_tcp_port;
                node->tls_port = msg_tls_port;
                node->cport = ntohs(g->cport);
                node->flags &= ~CLUSTER_NODE_NOADDR;
            }
        } else {
            /* If it's not in NOADDR state and we don't have it, we
             * add it to our trusted dict with exact nodeid and flag.
             * Note that we cannot simply start a handshake against
             * this IP/PORT pairs, since IP/PORT can be reused already,
             * otherwise we risk joining another cluster.
             *
             * Note that we require that the sender of this gossip message
             * is a well known node in our cluster, otherwise we risk
             * joining another cluster. */
            if (sender &&
                !(flags & CLUSTER_NODE_NOADDR) &&
                !clusterBlacklistExists(g->nodename))
            {
                clusterNode *node;
                node = createClusterNode(g->nodename, flags);
                memcpy(node->ip,g->ip,NET_IP_STR_LEN);
                node->tcp_port = msg_tcp_port;
                node->tls_port = msg_tls_port;
                node->cport = ntohs(g->cport);
                clusterAddNode(node);
            }
        }

        /* Next node */
        g++;
    }
}

/* IP -> string conversion. 'buf' is supposed to at least be 46 bytes.
 * If 'announced_ip' length is non-zero, it is used instead of extracting
 * the IP from the socket peer address. */
int nodeIp2String(char *buf, clusterLink *link, char *announced_ip) {
    if (announced_ip[0] != '\0') {
        memcpy(buf,announced_ip,NET_IP_STR_LEN);
        buf[NET_IP_STR_LEN-1] = '\0'; /* We are not sure the input is sane. */
        return C_OK;
    } else {
        if (connAddrPeerName(link->conn, buf, NET_IP_STR_LEN, NULL) == -1) {
            serverLog(LL_NOTICE, "Error converting peer IP to string: %s",
                link->conn ? connGetLastError(link->conn) : "no link");
            return C_ERR;
        }
        return C_OK;
    }
}

/* Update the node address to the IP address that can be extracted
 * from link->fd, or if hdr->myip is non empty, to the address the node
 * is announcing us.
 * The port is taken from the packet header as well.
 *
 * If the address or port changed, disconnect the node link so that we'll
 * connect again to the new address.
 *
 * If the ip/port pair are already correct no operation is performed at
 * all.
 *
 * The function returns 0 if the node address is still the same,
 * otherwise 1 is returned. */
int nodeUpdateAddressIfNeeded(clusterNode *node, clusterLink *link,
                              clusterMsg *hdr)
{
    char ip[NET_IP_STR_LEN] = {0};
    int cport = ntohs(hdr->cport);
    int tcp_port, tls_port;
    getClientPortFromClusterMsg(hdr, &tls_port, &tcp_port);

    /* We don't proceed if the link is the same as the sender link, as this
     * function is designed to see if the node link is consistent with the
     * symmetric link that is used to receive PINGs from the node.
     *
     * As a side effect this function never frees the passed 'link', so
     * it is safe to call during packet processing. */
    if (link == node->link) return 0;

    /* If the peer IP is unavailable for some reasons like invalid fd or closed
     * link, just give up the update this time, and the update will be retried
     * in the next round of PINGs */
    if (nodeIp2String(ip,link,hdr->myip) == C_ERR) return 0;

    if (node->tcp_port == tcp_port && node->cport == cport && node->tls_port == tls_port &&
        strcmp(ip,node->ip) == 0) return 0;

    /* IP / port is different, update it. */
    memcpy(node->ip,ip,sizeof(ip));
    node->tcp_port = tcp_port;
    node->tls_port = tls_port;
    node->cport = cport;
    /* Drop the stale link so a reconnect to the new address is attempted. */
    if (node->link) freeClusterLink(node->link);
    node->flags &= ~CLUSTER_NODE_NOADDR;
    serverLog(LL_NOTICE,"Address updated for node %.40s (%s), now %s:%d",
        node->name, node->human_nodename, node->ip, getNodeDefaultClientPort(node));

    /* Check if this is our master and we have to change the
     * replication target as well. */
    if (nodeIsSlave(myself) && myself->slaveof == node)
        replicationSetMaster(node->ip, getNodeDefaultReplicationPort(node));
    return 1;
}

/* Reconfigure the specified node 'n' as a master. This function is called when
 * a node that we believed to be a slave is now acting as master in order to
 * update the state of the node. */
void clusterSetNodeAsMaster(clusterNode *n) {
    if (clusterNodeIsMaster(n)) return;

    if (n->slaveof) {
        clusterNodeRemoveSlave(n->slaveof,n);
        if (n != myself) n->flags |= CLUSTER_NODE_MIGRATE_TO;
    }
    n->flags &= ~CLUSTER_NODE_SLAVE;
    n->flags |= CLUSTER_NODE_MASTER;
    n->slaveof = NULL;

    /* Update config and state. */
    clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                         CLUSTER_TODO_UPDATE_STATE);
}

/* This function is called when we receive a master configuration via a
 * PING, PONG or UPDATE packet. What we receive is a node, a configEpoch of the
 * node, and the set of slots claimed under this configEpoch.
 *
 * What we do is to rebind the slots with newer configuration compared to our
 * local configuration, and if needed, we turn ourself into a replica of the
 * node (see the function comments for more info).
 *
 * The 'sender' is the node for which we received a configuration update.
 * Sometimes it is not actually the "Sender" of the information, like in the
 * case we receive the info via an UPDATE packet. */
void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoch, unsigned char *slots) {
    int j;
    clusterNode *curmaster = NULL, *newmaster = NULL;
    /* The dirty slots list is a list of slots for which we lose the ownership
     * while having still keys inside. This usually happens after a failover
     * or after a manual cluster reconfiguration operated by the admin.
     *
     * If the update message is not able to demote a master to slave (in this
     * case we'll resync with the master updating the whole key space), we
     * need to delete all the keys in the slots we lost ownership. */
    uint16_t dirty_slots[CLUSTER_SLOTS];
    int dirty_slots_count = 0;

    /* We should detect if sender is new master of our shard.
     * We will know it if all our slots were migrated to sender, and sender
     * has no slots except ours */
    int sender_slots = 0;
    int migrated_our_slots = 0;

    /* Here we set curmaster to this node or the node this node
     * replicates to if it's a slave. In the for loop we are
     * interested to check if slots are taken away from curmaster. */
    curmaster = clusterNodeIsMaster(myself) ? myself : myself->slaveof;

    if (sender == myself) {
        serverLog(LL_NOTICE,"Discarding UPDATE message about myself.");
        return;
    }

    for (j = 0; j < CLUSTER_SLOTS; j++) {
        if (bitmapTestBit(slots,j)) {
            sender_slots++;

            /* The slot is already bound to the sender of this message. */
            if (server.cluster->slots[j] == sender) {
                bitmapClearBit(server.cluster->owner_not_claiming_slot, j);
                continue;
            }

            /* The slot is in importing state, it should be modified only
             * manually via redis-cli (example: a resharding is in progress
             * and the migrating side slot was already closed and is advertising
             * a new config. We still want the slot to be closed manually). */
            if (server.cluster->importing_slots_from[j]) continue;

            /* We rebind the slot to the new node claiming it if:
             * 1) The slot was unassigned or the previous owner no longer owns the slot or
             *    the new node claims it with a greater configEpoch.
             * 2) We are not currently importing the slot. */
            if (isSlotUnclaimed(j) ||
                server.cluster->slots[j]->configEpoch < senderConfigEpoch)
            {
                /* Was this slot mine, and still contains keys? Mark it as
                 * a dirty slot. */
                if (server.cluster->slots[j] == myself &&
                    countKeysInSlot(j) &&
                    sender != myself)
                {
                    dirty_slots[dirty_slots_count] = j;
                    dirty_slots_count++;
                }

                if (server.cluster->slots[j] == curmaster) {
                    newmaster = sender;
                    migrated_our_slots++;
                }
                clusterDelSlot(j);
                clusterAddSlot(sender,j);
                clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                                     CLUSTER_TODO_UPDATE_STATE|
                                     CLUSTER_TODO_FSYNC_CONFIG);
            }
        } else if (server.cluster->slots[j] == sender) {
            /* The slot is currently bound to the sender but the sender is no longer
             * claiming it. We don't want to unbind the slot yet as it can cause the cluster
             * to move to FAIL state and also throw client error. Keeping the slot bound to
             * the previous owner will cause a few client side redirects, but won't throw
             * any errors. We will keep track of the uncertainty in ownership to avoid
             * propagating misinformation about this slot's ownership using UPDATE
             * messages. */
            bitmapSetBit(server.cluster->owner_not_claiming_slot, j);
        }
    }

    /* After updating the slots configuration, don't do any actual change
     * in the state of the server if a module disabled Redis Cluster
     * keys redirections. */
    if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION)
        return;

    /* If at least one slot was reassigned from a node to another node
     * with a greater configEpoch, it is possible that:
     * 1) We are a master left without slots. This means that we were
     *    failed over and we should turn into a replica of the new
     *    master.
     * 2) We are a slave and our master is left without slots. We need
     *    to replicate to the new slots owner. */
    if (newmaster && curmaster->numslots == 0 &&
            (server.cluster_allow_replica_migration ||
             sender_slots == migrated_our_slots)) {
        serverLog(LL_NOTICE,
            "Configuration change detected. Reconfiguring myself "
            "as a replica of %.40s (%s)", sender->name, sender->human_nodename);
        clusterSetMaster(sender);
        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                             CLUSTER_TODO_UPDATE_STATE|
                             CLUSTER_TODO_FSYNC_CONFIG);
    } else if (myself->slaveof && myself->slaveof->slaveof &&
               /* In some rare case when CLUSTER FAILOVER TAKEOVER is used, it
                * can happen that myself is a replica of a replica of myself. If
                * this happens, we do nothing to avoid a crash and wait for the
                * admin to repair the cluster. */
               myself->slaveof->slaveof != myself)
    {
        /* Safeguard against sub-replicas. A replica's master can turn itself
         * into a replica if its last slot is removed. If no other node takes
         * over the slot, there is nothing else to trigger replica migration. */
        serverLog(LL_NOTICE,
            "I'm a sub-replica! Reconfiguring myself as a replica of grandmaster %.40s (%s)",
            myself->slaveof->slaveof->name, myself->slaveof->slaveof->human_nodename);
        clusterSetMaster(myself->slaveof->slaveof);
        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                             CLUSTER_TODO_UPDATE_STATE|
                             CLUSTER_TODO_FSYNC_CONFIG);
    } else if (dirty_slots_count) {
        /* If we are here, we received an update message which removed
         * ownership for certain slots we still have keys about, but still
         * we are serving some slots, so this master node was not demoted to
         * a slave.
         *
         * In order to maintain a consistent state between keys and slots
         * we need to remove all the keys from the slots we lost. */
        for (j = 0; j < dirty_slots_count; j++)
            delKeysInSlot(dirty_slots[j]);
    }
}

/* Cluster ping extensions.
 *
 * The ping/pong/meet messages support arbitrary extensions to add additional
 * metadata to the messages that are sent between the various nodes in the
 * cluster.
The extensions take the form: + * [ Header length + type (8 bytes) ] + * [ Extension information (Arbitrary length, but must be 8 byte padded) ] + */ + + +/* Returns the length of a given extension */ +static uint32_t getPingExtLength(clusterMsgPingExt *ext) { + return ntohl(ext->length); +} + +/* Returns the initial position of ping extensions. May return an invalid + * address if there are no ping extensions. */ +static clusterMsgPingExt *getInitialPingExt(clusterMsg *hdr, int count) { + clusterMsgPingExt *initial = (clusterMsgPingExt*) &(hdr->data.ping.gossip[count]); + return initial; +} + +/* Given a current ping extension, returns the start of the next extension. May return + * an invalid address if there are no further ping extensions. */ +static clusterMsgPingExt *getNextPingExt(clusterMsgPingExt *ext) { + clusterMsgPingExt *next = (clusterMsgPingExt *) (((char *) ext) + getPingExtLength(ext)); + return next; +} + +/* All PING extensions must be 8-byte aligned */ +uint32_t getAlignedPingExtSize(uint32_t dataSize) { + + return sizeof(clusterMsgPingExt) + EIGHT_BYTE_ALIGN(dataSize); +} + +uint32_t getHostnamePingExtSize(void) { + if (sdslen(myself->hostname) == 0) { + return 0; + } + return getAlignedPingExtSize(sdslen(myself->hostname) + 1); +} + +uint32_t getHumanNodenamePingExtSize(void) { + if (sdslen(myself->human_nodename) == 0) { + return 0; + } + return getAlignedPingExtSize(sdslen(myself->human_nodename) + 1); +} + +uint32_t getShardIdPingExtSize(void) { + return getAlignedPingExtSize(sizeof(clusterMsgPingExtShardId)); +} + +uint32_t getForgottenNodeExtSize(void) { + return getAlignedPingExtSize(sizeof(clusterMsgPingExtForgottenNode)); +} + +void *preparePingExt(clusterMsgPingExt *ext, uint16_t type, uint32_t length) { + ext->type = htons(type); + ext->length = htonl(length); + return &ext->ext[0]; +} + +clusterMsgPingExt *nextPingExt(clusterMsgPingExt *ext) { + return (clusterMsgPingExt *)((char*)ext + ntohl(ext->length)); +} + +/* 1. 
If a NULL hdr is provided, compute the extension size; + * 2. If a non-NULL hdr is provided, write the hostname ping + * extension at the start of the cursor. This function + * will update the cursor to point to the end of the + * written extension and will return the amount of bytes + * written. */ +uint32_t writePingExt(clusterMsg *hdr, int gossipcount) { + uint16_t extensions = 0; + uint32_t totlen = 0; + clusterMsgPingExt *cursor = NULL; + /* Set the initial extension position */ + if (hdr != NULL) { + cursor = getInitialPingExt(hdr, gossipcount); + } + + /* hostname is optional */ + if (sdslen(myself->hostname) != 0) { + if (cursor != NULL) { + /* Populate hostname */ + clusterMsgPingExtHostname *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_HOSTNAME, getHostnamePingExtSize()); + memcpy(ext->hostname, myself->hostname, sdslen(myself->hostname)); + + /* Move the write cursor */ + cursor = nextPingExt(cursor); + } + + totlen += getHostnamePingExtSize(); + extensions++; + } + + if (sdslen(myself->human_nodename) != 0) { + if (cursor != NULL) { + /* Populate human_nodename */ + clusterMsgPingExtHumanNodename *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_HUMAN_NODENAME, getHumanNodenamePingExtSize()); + memcpy(ext->human_nodename, myself->human_nodename, sdslen(myself->human_nodename)); + + /* Move the write cursor */ + cursor = nextPingExt(cursor); + } + + totlen += getHumanNodenamePingExtSize(); + extensions++; + } + + /* Gossip forgotten nodes */ + if (dictSize(server.cluster->nodes_black_list) > 0) { + dictIterator *di = dictGetIterator(server.cluster->nodes_black_list); + dictEntry *de; + while ((de = dictNext(di)) != NULL) { + if (cursor != NULL) { + uint64_t expire = dictGetUnsignedIntegerVal(de); + if ((time_t)expire < server.unixtime) continue; /* already expired */ + uint64_t ttl = expire - server.unixtime; + clusterMsgPingExtForgottenNode *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_FORGOTTEN_NODE, getForgottenNodeExtSize()); + 
memcpy(ext->name, dictGetKey(de), CLUSTER_NAMELEN); + ext->ttl = htonu64(ttl); + + /* Move the write cursor */ + cursor = nextPingExt(cursor); + } + totlen += getForgottenNodeExtSize(); + extensions++; + } + dictReleaseIterator(di); + } + + /* Populate shard_id */ + if (cursor != NULL) { + clusterMsgPingExtShardId *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_SHARDID, getShardIdPingExtSize()); + memcpy(ext->shard_id, myself->shard_id, CLUSTER_NAMELEN); + + /* Move the write cursor */ + cursor = nextPingExt(cursor); + } + totlen += getShardIdPingExtSize(); + extensions++; + + if (hdr != NULL) { + if (extensions != 0) { + hdr->mflags[0] |= CLUSTERMSG_FLAG0_EXT_DATA; + } + hdr->extensions = htons(extensions); + } + + return totlen; +} + +/* We previously validated the extensions, so this function just needs to + * handle the extensions. */ +void clusterProcessPingExtensions(clusterMsg *hdr, clusterLink *link) { + clusterNode *sender = link->node ? link->node : clusterLookupNode(hdr->sender, CLUSTER_NAMELEN); + char *ext_hostname = NULL; + char *ext_humannodename = NULL; + char *ext_shardid = NULL; + uint16_t extensions = ntohs(hdr->extensions); + /* Loop through all the extensions and process them */ + clusterMsgPingExt *ext = getInitialPingExt(hdr, ntohs(hdr->count)); + while (extensions--) { + uint16_t type = ntohs(ext->type); + if (type == CLUSTERMSG_EXT_TYPE_HOSTNAME) { + clusterMsgPingExtHostname *hostname_ext = (clusterMsgPingExtHostname *) &(ext->ext[0].hostname); + ext_hostname = hostname_ext->hostname; + } else if (type == CLUSTERMSG_EXT_TYPE_HUMAN_NODENAME) { + clusterMsgPingExtHumanNodename *humannodename_ext = (clusterMsgPingExtHumanNodename *) &(ext->ext[0].human_nodename); + ext_humannodename = humannodename_ext->human_nodename; + } else if (type == CLUSTERMSG_EXT_TYPE_FORGOTTEN_NODE) { + clusterMsgPingExtForgottenNode *forgotten_node_ext = &(ext->ext[0].forgotten_node); + clusterNode *n = clusterLookupNode(forgotten_node_ext->name, CLUSTER_NAMELEN); 
+ if (n && n != myself && !(nodeIsSlave(myself) && myself->slaveof == n)) { + sds id = sdsnewlen(forgotten_node_ext->name, CLUSTER_NAMELEN); + dictEntry *de = dictAddOrFind(server.cluster->nodes_black_list, id); + uint64_t expire = server.unixtime + ntohu64(forgotten_node_ext->ttl); + dictSetUnsignedIntegerVal(de, expire); + clusterDelNode(n); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_SAVE_CONFIG); + } + } else if (type == CLUSTERMSG_EXT_TYPE_SHARDID) { + clusterMsgPingExtShardId *shardid_ext = (clusterMsgPingExtShardId *) &(ext->ext[0].shard_id); + ext_shardid = shardid_ext->shard_id; + } else { + /* Unknown type, we will ignore it but log what happened. */ + serverLog(LL_WARNING, "Received unknown extension type %d", type); + } + + /* We know this will be valid since we validated it ahead of time */ + ext = getNextPingExt(ext); + } + /* If the node did not send us a hostname extension, assume + * they don't have an announced hostname. Otherwise, we'll + * set it now. */ + updateAnnouncedHostname(sender, ext_hostname); + updateAnnouncedHumanNodename(sender, ext_humannodename); + updateShardId(sender, ext_shardid); +} + +static clusterNode *getNodeFromLinkAndMsg(clusterLink *link, clusterMsg *hdr) { + clusterNode *sender; + if (link->node && !nodeInHandshake(link->node)) { + /* If the link has an associated node, use that so that we don't have to look it + * up every time, except when the node is still in handshake, the node still has + * a random name thus not truly "known". */ + sender = link->node; + } else { + /* Otherwise, fetch sender based on the message */ + sender = clusterLookupNode(hdr->sender, CLUSTER_NAMELEN); + /* We know the sender node but haven't associate it with the link. This must + * be an inbound link because only for inbound links we didn't know which node + * to associate when they were created. 
*/ + if (sender && !link->node) { + setClusterNodeToInboundClusterLink(sender, link); + } + } + return sender; +} + +/* When this function is called, there is a packet to process starting + * at link->rcvbuf. Releasing the buffer is up to the caller, so this + * function should just handle the higher level stuff of processing the + * packet, modifying the cluster state if needed. + * + * The function returns 1 if the link is still valid after the packet + * was processed, otherwise 0 if the link was freed since the packet + * processing lead to some inconsistency error (for instance a PONG + * received from the wrong sender ID). */ +int clusterProcessPacket(clusterLink *link) { + clusterMsg *hdr = (clusterMsg*) link->rcvbuf; + uint32_t totlen = ntohl(hdr->totlen); + uint16_t type = ntohs(hdr->type); + mstime_t now = mstime(); + + if (type < CLUSTERMSG_TYPE_COUNT) + server.cluster->stats_bus_messages_received[type]++; + serverLog(LL_DEBUG,"--- Processing packet of type %s, %lu bytes", + clusterGetMessageTypeString(type), (unsigned long) totlen); + + /* Perform sanity checks */ + if (totlen < 16) return 1; /* At least signature, version, totlen, count. */ + if (totlen > link->rcvbuf_len) return 1; + + if (ntohs(hdr->ver) != CLUSTER_PROTO_VER) { + /* Can't handle messages of different versions. 
*/ + return 1; + } + + if (type == server.cluster_drop_packet_filter) { + serverLog(LL_WARNING, "Dropping packet that matches debug drop filter"); + return 1; + } + + uint16_t flags = ntohs(hdr->flags); + uint16_t extensions = ntohs(hdr->extensions); + uint64_t senderCurrentEpoch = 0, senderConfigEpoch = 0; + uint32_t explen; /* expected length of this packet */ + clusterNode *sender; + + if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG || + type == CLUSTERMSG_TYPE_MEET) + { + uint16_t count = ntohs(hdr->count); + + explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + explen += (sizeof(clusterMsgDataGossip)*count); + + /* If there is extension data, which doesn't have a fixed length, + * loop through them and validate the length of it now. */ + if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) { + clusterMsgPingExt *ext = getInitialPingExt(hdr, count); + while (extensions--) { + uint16_t extlen = getPingExtLength(ext); + if (extlen % 8 != 0) { + serverLog(LL_WARNING, "Received a %s packet without proper padding (%d bytes)", + clusterGetMessageTypeString(type), (int) extlen); + return 1; + } + if ((totlen - explen) < extlen) { + serverLog(LL_WARNING, "Received invalid %s packet with extension data that exceeds " + "total packet length (%lld)", clusterGetMessageTypeString(type), + (unsigned long long) totlen); + return 1; + } + explen += extlen; + ext = getNextPingExt(ext); + } + } + } else if (type == CLUSTERMSG_TYPE_FAIL) { + explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + explen += sizeof(clusterMsgDataFail); + } else if (type == CLUSTERMSG_TYPE_PUBLISH || type == CLUSTERMSG_TYPE_PUBLISHSHARD) { + explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + explen += sizeof(clusterMsgDataPublish) - + 8 + + ntohl(hdr->data.publish.msg.channel_len) + + ntohl(hdr->data.publish.msg.message_len); + } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST || + type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK || + type == CLUSTERMSG_TYPE_MFSTART) 
+ { + explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + } else if (type == CLUSTERMSG_TYPE_UPDATE) { + explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + explen += sizeof(clusterMsgDataUpdate); + } else if (type == CLUSTERMSG_TYPE_MODULE) { + explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + explen += sizeof(clusterMsgModule) - + 3 + ntohl(hdr->data.module.msg.len); + } else { + /* We don't know this type of packet, so we assume it's well formed. */ + explen = totlen; + } + + if (totlen != explen) { + serverLog(LL_WARNING, "Received invalid %s packet of length %lld but expected length %lld", + clusterGetMessageTypeString(type), (unsigned long long) totlen, (unsigned long long) explen); + return 1; + } + + sender = getNodeFromLinkAndMsg(link, hdr); + + /* Update the last time we saw any data from this node. We + * use this in order to avoid detecting a timeout from a node that + * is just sending a lot of data in the cluster bus, for instance + * because of Pub/Sub. */ + if (sender) sender->data_received = now; + + if (sender && !nodeInHandshake(sender)) { + /* Update our currentEpoch if we see a newer epoch in the cluster. */ + senderCurrentEpoch = ntohu64(hdr->currentEpoch); + senderConfigEpoch = ntohu64(hdr->configEpoch); + if (senderCurrentEpoch > server.cluster->currentEpoch) + server.cluster->currentEpoch = senderCurrentEpoch; + /* Update the sender configEpoch if it is publishing a newer one. */ + if (senderConfigEpoch > sender->configEpoch) { + sender->configEpoch = senderConfigEpoch; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_FSYNC_CONFIG); + } + /* Update the replication offset info for this node. */ + sender->repl_offset = ntohu64(hdr->offset); + sender->repl_offset_time = now; + /* If we are a slave performing a manual failover and our master + * sent its offset while already paused, populate the MF state. 
*/ + if (server.cluster->mf_end && + nodeIsSlave(myself) && + myself->slaveof == sender && + hdr->mflags[0] & CLUSTERMSG_FLAG0_PAUSED && + server.cluster->mf_master_offset == -1) + { + server.cluster->mf_master_offset = sender->repl_offset; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER); + serverLog(LL_NOTICE, + "Received replication offset for paused " + "master manual failover: %lld", + server.cluster->mf_master_offset); + } + } + + /* Initial processing of PING and MEET requests replying with a PONG. */ + if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_MEET) { + /* We use incoming MEET messages in order to set the address + * for 'myself', since only other cluster nodes will send us + * MEET messages on handshakes, when the cluster joins, or + * later if we changed address, and those nodes will use our + * official address to connect to us. So by obtaining this address + * from the socket is a simple way to discover / update our own + * address in the cluster without it being hardcoded in the config. + * + * However if we don't have an address at all, we update the address + * even with a normal PING packet. If it's wrong it will be fixed + * by MEET later. */ + if ((type == CLUSTERMSG_TYPE_MEET || myself->ip[0] == '\0') && + server.cluster_announce_ip == NULL) + { + char ip[NET_IP_STR_LEN]; + + if (connAddrSockName(link->conn,ip,sizeof(ip),NULL) != -1 && + strcmp(ip,myself->ip)) + { + memcpy(myself->ip,ip,NET_IP_STR_LEN); + serverLog(LL_NOTICE,"IP address for this node updated to %s", + myself->ip); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + } + } + + /* Add this node if it is new for us and the msg type is MEET. + * In this stage we don't try to add the node with the right + * flags, slaveof pointer, and so forth, as this details will be + * resolved when we'll receive PONGs from the node. 
*/ + if (!sender && type == CLUSTERMSG_TYPE_MEET) { + clusterNode *node; + + node = createClusterNode(NULL,CLUSTER_NODE_HANDSHAKE); + serverAssert(nodeIp2String(node->ip,link,hdr->myip) == C_OK); + getClientPortFromClusterMsg(hdr, &node->tls_port, &node->tcp_port); + node->cport = ntohs(hdr->cport); + clusterAddNode(node); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + } + + /* If this is a MEET packet from an unknown node, we still process + * the gossip section here since we have to trust the sender because + * of the message type. */ + if (!sender && type == CLUSTERMSG_TYPE_MEET) + clusterProcessGossipSection(hdr,link); + + /* Anyway reply with a PONG */ + clusterSendPing(link,CLUSTERMSG_TYPE_PONG); + } + + /* PING, PONG, MEET: process config information. */ + if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG || + type == CLUSTERMSG_TYPE_MEET) + { + serverLog(LL_DEBUG,"%s packet received: %.40s", + clusterGetMessageTypeString(type), + link->node ? link->node->name : "NULL"); + if (!link->inbound) { + if (nodeInHandshake(link->node)) { + /* If we already have this node, try to change the + * IP/port of the node with the new one. */ + if (sender) { + serverLog(LL_VERBOSE, + "Handshake: we already know node %.40s (%s), " + "updating the address if needed.", sender->name, sender->human_nodename); + if (nodeUpdateAddressIfNeeded(sender,link,hdr)) + { + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); + } + /* Free this node as we already have it. This will + * cause the link to be freed as well. */ + clusterDelNode(link->node); + return 0; + } + + /* First thing to do is replacing the random name with the + * right node name if this was a handshake stage. 
*/ + clusterRenameNode(link->node, hdr->sender); + serverLog(LL_DEBUG,"Handshake with node %.40s completed.", + link->node->name); + link->node->flags &= ~CLUSTER_NODE_HANDSHAKE; + link->node->flags |= flags&(CLUSTER_NODE_MASTER|CLUSTER_NODE_SLAVE); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + } else if (memcmp(link->node->name,hdr->sender, + CLUSTER_NAMELEN) != 0) + { + /* If the reply has a non matching node ID we + * disconnect this node and set it as not having an associated + * address. */ + serverLog(LL_DEBUG,"PONG contains mismatching sender ID. About node %.40s added %d ms ago, having flags %d", + link->node->name, + (int)(now-(link->node->ctime)), + link->node->flags); + link->node->flags |= CLUSTER_NODE_NOADDR; + link->node->ip[0] = '\0'; + link->node->tcp_port = 0; + link->node->tls_port = 0; + link->node->cport = 0; + freeClusterLink(link); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + return 0; + } + } + + /* Copy the CLUSTER_NODE_NOFAILOVER flag from what the sender + * announced. This is a dynamic flag that we receive from the + * sender, and the latest status must be trusted. We need it to + * be propagated because the slave ranking used to understand the + * delay of each slave in the voting process, needs to know + * what are the instances really competing. */ + if (sender) { + int nofailover = flags & CLUSTER_NODE_NOFAILOVER; + sender->flags &= ~CLUSTER_NODE_NOFAILOVER; + sender->flags |= nofailover; + } + + /* Update the node address if it changed. 
*/ + if (sender && type == CLUSTERMSG_TYPE_PING && + !nodeInHandshake(sender) && + nodeUpdateAddressIfNeeded(sender,link,hdr)) + { + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); + } + + /* Update our info about the node */ + if (!link->inbound && type == CLUSTERMSG_TYPE_PONG) { + link->node->pong_received = now; + link->node->ping_sent = 0; + + /* The PFAIL condition can be reversed without external + * help if it is momentary (that is, if it does not + * turn into a FAIL state). + * + * The FAIL condition is also reversible under specific + * conditions detected by clearNodeFailureIfNeeded(). */ + if (nodeTimedOut(link->node)) { + link->node->flags &= ~CLUSTER_NODE_PFAIL; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); + } else if (nodeFailed(link->node)) { + clearNodeFailureIfNeeded(link->node); + } + } + + /* Check for role switch: slave -> master or master -> slave. */ + if (sender) { + if (!memcmp(hdr->slaveof,CLUSTER_NODE_NULL_NAME, + sizeof(hdr->slaveof))) + { + /* Node is a master. */ + clusterSetNodeAsMaster(sender); + } else { + /* Node is a slave. */ + clusterNode *master = clusterLookupNode(hdr->slaveof, CLUSTER_NAMELEN); + + if (clusterNodeIsMaster(sender)) { + /* Master turned into a slave! Reconfigure the node. */ + clusterDelNodeSlots(sender); + sender->flags &= ~(CLUSTER_NODE_MASTER| + CLUSTER_NODE_MIGRATE_TO); + sender->flags |= CLUSTER_NODE_SLAVE; + + /* Update config and state. */ + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); + } + + /* Master node changed for this slave? */ + if (master && sender->slaveof != master) { + if (sender->slaveof) + clusterNodeRemoveSlave(sender->slaveof,sender); + clusterNodeAddSlave(master,sender); + sender->slaveof = master; + + /* Update config. */ + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + } + } + } + + /* Update our info about served slots. 
+ * + * Note: this MUST happen after we update the master/slave state + * so that CLUSTER_NODE_MASTER flag will be set. */ + + /* Many checks are only needed if the set of served slots this + * instance claims is different compared to the set of slots we have + * for it. Check this ASAP to avoid other computational expansive + * checks later. */ + clusterNode *sender_master = NULL; /* Sender or its master if slave. */ + int dirty_slots = 0; /* Sender claimed slots don't match my view? */ + + if (sender) { + sender_master = clusterNodeIsMaster(sender) ? sender : sender->slaveof; + if (sender_master) { + dirty_slots = memcmp(sender_master->slots, + hdr->myslots,sizeof(hdr->myslots)) != 0; + } + } + + /* 1) If the sender of the message is a master, and we detected that + * the set of slots it claims changed, scan the slots to see if we + * need to update our configuration. */ + if (sender && clusterNodeIsMaster(sender) && dirty_slots) + clusterUpdateSlotsConfigWith(sender,senderConfigEpoch,hdr->myslots); + + /* 2) We also check for the reverse condition, that is, the sender + * claims to serve slots we know are served by a master with a + * greater configEpoch. If this happens we inform the sender. + * + * This is useful because sometimes after a partition heals, a + * reappearing master may be the last one to claim a given set of + * hash slots, but with a configuration that other instances know to + * be deprecated. Example: + * + * A and B are master and slave for slots 1,2,3. + * A is partitioned away, B gets promoted. + * B is partitioned away, and A returns available. + * + * Usually B would PING A publishing its set of served slots and its + * configEpoch, but because of the partition B can't inform A of the + * new configuration, so other nodes that have an updated table must + * do it. In this way A will stop to act as a master (or can try to + * failover if there are the conditions to win the election). 
*/ + if (sender && dirty_slots) { + int j; + + for (j = 0; j < CLUSTER_SLOTS; j++) { + if (bitmapTestBit(hdr->myslots,j)) { + if (server.cluster->slots[j] == sender || + isSlotUnclaimed(j)) continue; + if (server.cluster->slots[j]->configEpoch > + senderConfigEpoch) + { + serverLog(LL_VERBOSE, + "Node %.40s has old slots configuration, sending " + "an UPDATE message about %.40s", + sender->name, server.cluster->slots[j]->name); + clusterSendUpdate(sender->link, + server.cluster->slots[j]); + + /* TODO: instead of exiting the loop send every other + * UPDATE packet for other nodes that are the new owner + * of sender's slots. */ + break; + } + } + } + } + + /* If our config epoch collides with the sender's try to fix + * the problem. */ + if (sender && clusterNodeIsMaster(myself) && clusterNodeIsMaster(sender) && + senderConfigEpoch == myself->configEpoch) + { + clusterHandleConfigEpochCollision(sender); + } + + /* Get info from the gossip section */ + if (sender) { + clusterProcessGossipSection(hdr,link); + clusterProcessPingExtensions(hdr,link); + } + } else if (type == CLUSTERMSG_TYPE_FAIL) { + clusterNode *failing; + + if (sender) { + failing = clusterLookupNode(hdr->data.fail.about.nodename, CLUSTER_NAMELEN); + if (failing && + !(failing->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_MYSELF))) + { + serverLog(LL_NOTICE, + "FAIL message received from %.40s (%s) about %.40s (%s)", + hdr->sender, sender->human_nodename, hdr->data.fail.about.nodename, failing->human_nodename); + failing->flags |= CLUSTER_NODE_FAIL; + failing->fail_time = now; + failing->flags &= ~CLUSTER_NODE_PFAIL; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); + } + } else { + serverLog(LL_NOTICE, + "Ignoring FAIL message from unknown node %.40s about %.40s", + hdr->sender, hdr->data.fail.about.nodename); + } + } else if (type == CLUSTERMSG_TYPE_PUBLISH || type == CLUSTERMSG_TYPE_PUBLISHSHARD) { + if (!sender) return 1; /* We don't know that node. 
*/ + + robj *channel, *message; + uint32_t channel_len, message_len; + + /* Don't bother creating useless objects if there are no + * Pub/Sub subscribers. */ + if ((type == CLUSTERMSG_TYPE_PUBLISH + && serverPubsubSubscriptionCount() > 0) + || (type == CLUSTERMSG_TYPE_PUBLISHSHARD + && serverPubsubShardSubscriptionCount() > 0)) + { + channel_len = ntohl(hdr->data.publish.msg.channel_len); + message_len = ntohl(hdr->data.publish.msg.message_len); + channel = createStringObject( + (char*)hdr->data.publish.msg.bulk_data,channel_len); + message = createStringObject( + (char*)hdr->data.publish.msg.bulk_data+channel_len, + message_len); + pubsubPublishMessage(channel, message, type == CLUSTERMSG_TYPE_PUBLISHSHARD); + decrRefCount(channel); + decrRefCount(message); + } + } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST) { + if (!sender) return 1; /* We don't know that node. */ + clusterSendFailoverAuthIfNeeded(sender,hdr); + } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK) { + if (!sender) return 1; /* We don't know that node. */ + /* We consider this vote only if the sender is a master serving + * a non zero number of slots, and its currentEpoch is greater or + * equal to epoch where this node started the election. */ + if (clusterNodeIsMaster(sender) && sender->numslots > 0 && + senderCurrentEpoch >= server.cluster->failover_auth_epoch) + { + server.cluster->failover_auth_count++; + /* Maybe we reached a quorum here, set a flag to make sure + * we check ASAP. */ + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); + } + } else if (type == CLUSTERMSG_TYPE_MFSTART) { + /* This message is acceptable only if I'm a master and the sender + * is one of my slaves. */ + if (!sender || sender->slaveof != myself) return 1; + /* Manual failover requested from slaves. Initialize the state + * accordingly. 
*/ + resetManualFailover(); + server.cluster->mf_end = now + CLUSTER_MF_TIMEOUT; + server.cluster->mf_slave = sender; + pauseActions(PAUSE_DURING_FAILOVER, + now + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT), + PAUSE_ACTIONS_CLIENT_WRITE_SET); + serverLog(LL_NOTICE,"Manual failover requested by replica %.40s (%s).", + sender->name, sender->human_nodename); + /* We need to send a ping message to the replica, as it would carry + * `server.cluster->mf_master_offset`, which means the master paused clients + * at offset `server.cluster->mf_master_offset`, so that the replica would + * know that it is safe to set its `server.cluster->mf_can_start` to 1 so as + * to complete failover as quickly as possible. */ + clusterSendPing(link, CLUSTERMSG_TYPE_PING); + } else if (type == CLUSTERMSG_TYPE_UPDATE) { + clusterNode *n; /* The node the update is about. */ + uint64_t reportedConfigEpoch = + ntohu64(hdr->data.update.nodecfg.configEpoch); + + if (!sender) return 1; /* We don't know the sender. */ + n = clusterLookupNode(hdr->data.update.nodecfg.nodename, CLUSTER_NAMELEN); + if (!n) return 1; /* We don't know the reported node. */ + if (n->configEpoch >= reportedConfigEpoch) return 1; /* Nothing new. */ + + /* If in our current config the node is a slave, set it as a master. */ + if (nodeIsSlave(n)) clusterSetNodeAsMaster(n); + + /* Update the node's configEpoch. */ + n->configEpoch = reportedConfigEpoch; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_FSYNC_CONFIG); + + /* Check the bitmap of served slots and update our + * config accordingly. */ + clusterUpdateSlotsConfigWith(n,reportedConfigEpoch, + hdr->data.update.nodecfg.slots); + } else if (type == CLUSTERMSG_TYPE_MODULE) { + if (!sender) return 1; /* Protect the module from unknown nodes. */ + /* We need to route this message back to the right module subscribed + * for the right message type. 
*/ + uint64_t module_id = hdr->data.module.msg.module_id; /* Endian-safe ID */ + uint32_t len = ntohl(hdr->data.module.msg.len); + uint8_t type = hdr->data.module.msg.type; + unsigned char *payload = hdr->data.module.msg.bulk_data; + moduleCallClusterReceivers(sender->name,module_id,type,payload,len); + } else { + serverLog(LL_WARNING,"Received unknown packet type: %d", type); + } + return 1; +} + +/* This function is called when we detect the link with this node is lost. + We set the node as no longer connected. The Cluster Cron will detect + this connection and will try to get it connected again. + + Instead if the node is a temporary node used to accept a query, we + completely free the node on error. */ +void handleLinkIOError(clusterLink *link) { + freeClusterLink(link); +} + +/* Send the messages queued for the link. */ +void clusterWriteHandler(connection *conn) { + clusterLink *link = connGetPrivateData(conn); + ssize_t nwritten; + size_t totwritten = 0; + + while (totwritten < NET_MAX_WRITES_PER_EVENT && listLength(link->send_msg_queue) > 0) { + listNode *head = listFirst(link->send_msg_queue); + clusterMsgSendBlock *msgblock = (clusterMsgSendBlock*)head->value; + clusterMsg *msg = &msgblock->msg; + size_t msg_offset = link->head_msg_send_offset; + size_t msg_len = ntohl(msg->totlen); + + nwritten = connWrite(conn, (char*)msg + msg_offset, msg_len - msg_offset); + if (nwritten <= 0) { + serverLog(LL_DEBUG,"I/O error writing to node link: %s", + (nwritten == -1) ? 
connGetLastError(conn) : "short write"); + handleLinkIOError(link); + return; + } + if (msg_offset + nwritten < msg_len) { + /* If full message wasn't written, record the offset + * and continue sending from this point next time */ + link->head_msg_send_offset += nwritten; + return; + } + serverAssert((msg_offset + nwritten) == msg_len); + link->head_msg_send_offset = 0; + + /* Delete the node and update our memory tracking */ + uint32_t blocklen = msgblock->totlen; + listDelNode(link->send_msg_queue, head); + server.stat_cluster_links_memory -= sizeof(listNode); + link->send_msg_queue_mem -= sizeof(listNode) + blocklen; + + totwritten += nwritten; + } + + if (listLength(link->send_msg_queue) == 0) + connSetWriteHandler(link->conn, NULL); +} + +/* A connect handler that gets called when a connection to another node + * gets established. + */ +void clusterLinkConnectHandler(connection *conn) { + clusterLink *link = connGetPrivateData(conn); + clusterNode *node = link->node; + + /* Check if connection succeeded */ + if (connGetState(conn) != CONN_STATE_CONNECTED) { + serverLog(LL_VERBOSE, "Connection with Node %.40s at %s:%d failed: %s", + node->name, node->ip, node->cport, + connGetLastError(conn)); + freeClusterLink(link); + return; + } + + /* Register a read handler from now on */ + connSetReadHandler(conn, clusterReadHandler); + + /* Queue a PING in the new connection ASAP: this is crucial + * to avoid false positives in failure detection. + * + * If the node is flagged as MEET, we send a MEET message instead + * of a PING one, to force the receiver to add us in its node + * table. */ + mstime_t old_ping_sent = node->ping_sent; + clusterSendPing(link, node->flags & CLUSTER_NODE_MEET ? + CLUSTERMSG_TYPE_MEET : CLUSTERMSG_TYPE_PING); + if (old_ping_sent) { + /* If there was an active ping before the link was + * disconnected, we want to restore the ping time, otherwise + * replaced by the clusterSendPing() call. 
*/ + node->ping_sent = old_ping_sent; + } + /* We can clear the flag after the first packet is sent. + * If we'll never receive a PONG, we'll never send new packets + * to this node. Instead after the PONG is received and we + * are no longer in meet/handshake status, we want to send + * normal PING packets. */ + node->flags &= ~CLUSTER_NODE_MEET; + + serverLog(LL_DEBUG,"Connecting with Node %.40s at %s:%d", + node->name, node->ip, node->cport); +} + +/* Read data. Try to read the first field of the header first to check the + * full length of the packet. When a whole packet is in memory this function + * will call the function to process the packet. And so forth. */ +void clusterReadHandler(connection *conn) { + clusterMsg buf[1]; + ssize_t nread; + clusterMsg *hdr; + clusterLink *link = connGetPrivateData(conn); + unsigned int readlen, rcvbuflen; + + while(1) { /* Read as long as there is data to read. */ + rcvbuflen = link->rcvbuf_len; + if (rcvbuflen < 8) { + /* First, obtain the first 8 bytes to get the full message + * length. */ + readlen = 8 - rcvbuflen; + } else { + /* Finally read the full message. */ + hdr = (clusterMsg*) link->rcvbuf; + if (rcvbuflen == 8) { + /* Perform some sanity check on the message signature + * and length. */ + if (memcmp(hdr->sig,"RCmb",4) != 0 || + ntohl(hdr->totlen) < CLUSTERMSG_MIN_LEN) + { + char ip[NET_IP_STR_LEN]; + int port; + if (connAddrPeerName(conn, ip, sizeof(ip), &port) == -1) { + serverLog(LL_WARNING, + "Bad message length or signature received " + "on the Cluster bus."); + } else { + serverLog(LL_WARNING, + "Bad message length or signature received " + "on the Cluster bus from %s:%d", ip, port); + } + handleLinkIOError(link); + return; + } + } + readlen = ntohl(hdr->totlen) - rcvbuflen; + if (readlen > sizeof(buf)) readlen = sizeof(buf); + } + + nread = connRead(conn,buf,readlen); + if (nread == -1 && (connGetState(conn) == CONN_STATE_CONNECTED)) return; /* No more data ready. 
*/ + + if (nread <= 0) { + /* I/O error... */ + serverLog(LL_DEBUG,"I/O error reading from node link: %s", + (nread == 0) ? "connection closed" : connGetLastError(conn)); + handleLinkIOError(link); + return; + } else { + /* Read data and recast the pointer to the new buffer. */ + size_t unused = link->rcvbuf_alloc - link->rcvbuf_len; + if ((size_t)nread > unused) { + size_t required = link->rcvbuf_len + nread; + size_t prev_rcvbuf_alloc = link->rcvbuf_alloc; + /* If less than 1mb, grow to twice the needed size, if larger grow by 1mb. */ + link->rcvbuf_alloc = required < RCVBUF_MAX_PREALLOC ? required * 2: required + RCVBUF_MAX_PREALLOC; + link->rcvbuf = zrealloc(link->rcvbuf, link->rcvbuf_alloc); + server.stat_cluster_links_memory += link->rcvbuf_alloc - prev_rcvbuf_alloc; + } + memcpy(link->rcvbuf + link->rcvbuf_len, buf, nread); + link->rcvbuf_len += nread; + hdr = (clusterMsg*) link->rcvbuf; + rcvbuflen += nread; + } + + /* Total length obtained? Process this packet. */ + if (rcvbuflen >= 8 && rcvbuflen == ntohl(hdr->totlen)) { + if (clusterProcessPacket(link)) { + if (link->rcvbuf_alloc > RCVBUF_INIT_LEN) { + size_t prev_rcvbuf_alloc = link->rcvbuf_alloc; + zfree(link->rcvbuf); + link->rcvbuf = zmalloc(link->rcvbuf_alloc = RCVBUF_INIT_LEN); + server.stat_cluster_links_memory += link->rcvbuf_alloc - prev_rcvbuf_alloc; + } + link->rcvbuf_len = 0; + } else { + return; /* Link no longer valid. */ + } + } + } +} + +/* Put the message block into the link's send queue. + * + * It is guaranteed that this function will never have as a side effect + * the link to be invalidated, so it is safe to call this function + * from event handlers that will do stuff with the same link later. 
*/ +void clusterSendMessage(clusterLink *link, clusterMsgSendBlock *msgblock) { + if (!link) { + return; + } + if (listLength(link->send_msg_queue) == 0 && msgblock->msg.totlen != 0) + connSetWriteHandlerWithBarrier(link->conn, clusterWriteHandler, 1); + + listAddNodeTail(link->send_msg_queue, msgblock); + msgblock->refcount++; + + /* Update memory tracking */ + link->send_msg_queue_mem += sizeof(listNode) + msgblock->totlen; + server.stat_cluster_links_memory += sizeof(listNode); + + /* Populate sent messages stats. */ + uint16_t type = ntohs(msgblock->msg.type); + if (type < CLUSTERMSG_TYPE_COUNT) + server.cluster->stats_bus_messages_sent[type]++; +} + +/* Send a message to all the nodes that are part of the cluster having + * a connected link. + * + * It is guaranteed that this function will never have as a side effect + * some node->link to be invalidated, so it is safe to call this function + * from event handlers that will do stuff with node links later. */ +void clusterBroadcastMessage(clusterMsgSendBlock *msgblock) { + dictIterator *di; + dictEntry *de; + + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + if (node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE)) + continue; + clusterSendMessage(node->link,msgblock); + } + dictReleaseIterator(di); +} + +/* Build the message header. hdr must point to a buffer at least + * sizeof(clusterMsg) in bytes. */ +static void clusterBuildMessageHdr(clusterMsg *hdr, int type, size_t msglen) { + uint64_t offset; + clusterNode *master; + + /* If this node is a master, we send its slots bitmap and configEpoch. + * If this node is a slave we send the master's information instead (the + * node is flagged as slave so the receiver knows that it is NOT really + * in charge for this slots. */ + master = (nodeIsSlave(myself) && myself->slaveof) ? 
+ myself->slaveof : myself; + + hdr->ver = htons(CLUSTER_PROTO_VER); + hdr->sig[0] = 'R'; + hdr->sig[1] = 'C'; + hdr->sig[2] = 'm'; + hdr->sig[3] = 'b'; + hdr->type = htons(type); + memcpy(hdr->sender,myself->name,CLUSTER_NAMELEN); + + /* If cluster-announce-ip option is enabled, force the receivers of our + * packets to use the specified address for this node. Otherwise if the + * first byte is zero, they'll do auto discovery. */ + memset(hdr->myip,0,NET_IP_STR_LEN); + if (server.cluster_announce_ip) { + redis_strlcpy(hdr->myip,server.cluster_announce_ip,NET_IP_STR_LEN); + } + + /* Handle cluster-announce-[tls-|bus-]port. */ + int announced_tcp_port, announced_tls_port, announced_cport; + deriveAnnouncedPorts(&announced_tcp_port, &announced_tls_port, &announced_cport); + + memcpy(hdr->myslots,master->slots,sizeof(hdr->myslots)); + memset(hdr->slaveof,0,CLUSTER_NAMELEN); + if (myself->slaveof != NULL) + memcpy(hdr->slaveof,myself->slaveof->name, CLUSTER_NAMELEN); + if (server.tls_cluster) { + hdr->port = htons(announced_tls_port); + hdr->pport = htons(announced_tcp_port); + } else { + hdr->port = htons(announced_tcp_port); + hdr->pport = htons(announced_tls_port); + } + hdr->cport = htons(announced_cport); + hdr->flags = htons(myself->flags); + hdr->state = server.cluster->state; + + /* Set the currentEpoch and configEpochs. */ + hdr->currentEpoch = htonu64(server.cluster->currentEpoch); + hdr->configEpoch = htonu64(master->configEpoch); + + /* Set the replication offset. */ + if (nodeIsSlave(myself)) + offset = replicationGetSlaveOffset(); + else + offset = server.master_repl_offset; + hdr->offset = htonu64(offset); + + /* Set the message flags. */ + if (clusterNodeIsMaster(myself) && server.cluster->mf_end) + hdr->mflags[0] |= CLUSTERMSG_FLAG0_PAUSED; + + hdr->totlen = htonl(msglen); +} + +/* Set the i-th entry of the gossip section in the message pointed by 'hdr' + * to the info of the specified node 'n'. 
*/ +void clusterSetGossipEntry(clusterMsg *hdr, int i, clusterNode *n) { + clusterMsgDataGossip *gossip; + gossip = &(hdr->data.ping.gossip[i]); + memcpy(gossip->nodename,n->name,CLUSTER_NAMELEN); + gossip->ping_sent = htonl(n->ping_sent/1000); + gossip->pong_received = htonl(n->pong_received/1000); + memcpy(gossip->ip,n->ip,sizeof(n->ip)); + if (server.tls_cluster) { + gossip->port = htons(n->tls_port); + gossip->pport = htons(n->tcp_port); + } else { + gossip->port = htons(n->tcp_port); + gossip->pport = htons(n->tls_port); + } + gossip->cport = htons(n->cport); + gossip->flags = htons(n->flags); + gossip->notused1 = 0; +} + +/* Send a PING or PONG packet to the specified node, making sure to add enough + * gossip information. */ +void clusterSendPing(clusterLink *link, int type) { + static unsigned long long cluster_pings_sent = 0; + cluster_pings_sent++; + int gossipcount = 0; /* Number of gossip sections added so far. */ + int wanted; /* Number of gossip sections we want to append if possible. */ + int estlen; /* Upper bound on estimated packet length */ + /* freshnodes is the max number of nodes we can hope to append at all: + * nodes available minus two (ourself and the node we are sending the + * message to). However practically there may be less valid nodes since + * nodes in handshake state, disconnected, are not considered. */ + int freshnodes = dictSize(server.cluster->nodes)-2; + + /* How many gossip sections we want to add? 1/10 of the number of nodes + * and anyway at least 3. Why 1/10? + * + * If we have N masters, with N/10 entries, and we consider that in + * node_timeout we exchange with each other node at least 4 packets + * (we ping in the worst case in node_timeout/2 time, and we also + * receive two pings from the host), we have a total of 8 packets + * in the node_timeout*2 failure reports validity time. 
So we have + * that, for a single PFAIL node, we can expect to receive the following + * number of failure reports (in the specified window of time): + * + * PROB * GOSSIP_ENTRIES_PER_PACKET * TOTAL_PACKETS: + * + * PROB = probability of being featured in a single gossip entry, + * which is 1 / NUM_OF_NODES. + * ENTRIES = 10. + * TOTAL_PACKETS = 2 * 4 * NUM_OF_MASTERS. + * + * If we assume we have just masters (so num of nodes and num of masters + * is the same), with 1/10 we always get over the majority, and specifically + * 80% of the number of nodes, to account for many masters failing at the + * same time. + * + * Since we have non-voting slaves that lower the probability of an entry + * to feature our node, we set the number of entries per packet as + * 10% of the total nodes we have. */ + wanted = floor(dictSize(server.cluster->nodes)/10); + if (wanted < 3) wanted = 3; + if (wanted > freshnodes) wanted = freshnodes; + + /* Include all the nodes in PFAIL state, so that failure reports are + * faster to propagate to go from PFAIL to FAIL state. */ + int pfail_wanted = server.cluster->stats_pfail_nodes; + + /* Compute the maximum estlen to allocate our buffer. We'll fix the estlen + * later according to the number of gossip sections we really were able + * to put inside the packet. */ + estlen = sizeof(clusterMsg) - sizeof(union clusterMsgData); + estlen += (sizeof(clusterMsgDataGossip)*(wanted + pfail_wanted)); + estlen += writePingExt(NULL, 0); + /* Note: clusterBuildMessageHdr() expects the buffer to be always at least + * sizeof(clusterMsg) or more. 
*/ + if (estlen < (int)sizeof(clusterMsg)) estlen = sizeof(clusterMsg); + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, estlen); + clusterMsg *hdr = &msgblock->msg; + + if (!link->inbound && type == CLUSTERMSG_TYPE_PING) + link->node->ping_sent = mstime(); + + /* Populate the gossip fields */ + int maxiterations = wanted*3; + while(freshnodes > 0 && gossipcount < wanted && maxiterations--) { + dictEntry *de = dictGetRandomKey(server.cluster->nodes); + clusterNode *this = dictGetVal(de); + + /* Don't include this node: the whole packet header is about us + * already, so we just gossip about other nodes. */ + if (this == myself) continue; + + /* PFAIL nodes will be added later. */ + if (this->flags & CLUSTER_NODE_PFAIL) continue; + + /* In the gossip section don't include: + * 1) Nodes in HANDSHAKE state. + * 3) Nodes with the NOADDR flag set. + * 4) Disconnected nodes if they don't have configured slots. + */ + if (this->flags & (CLUSTER_NODE_HANDSHAKE|CLUSTER_NODE_NOADDR) || + (this->link == NULL && this->numslots == 0)) + { + freshnodes--; /* Technically not correct, but saves CPU. */ + continue; + } + + /* Do not add a node we already have. */ + if (this->last_in_ping_gossip == cluster_pings_sent) continue; + + /* Add it */ + clusterSetGossipEntry(hdr,gossipcount,this); + this->last_in_ping_gossip = cluster_pings_sent; + freshnodes--; + gossipcount++; + } + + /* If there are PFAIL nodes, add them at the end. 
*/ + if (pfail_wanted) { + dictIterator *di; + dictEntry *de; + + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL && pfail_wanted > 0) { + clusterNode *node = dictGetVal(de); + if (node->flags & CLUSTER_NODE_HANDSHAKE) continue; + if (node->flags & CLUSTER_NODE_NOADDR) continue; + if (!(node->flags & CLUSTER_NODE_PFAIL)) continue; + clusterSetGossipEntry(hdr,gossipcount,node); + gossipcount++; + /* We take the count of the slots we allocated, since the + * PFAIL stats may not match perfectly with the current number + * of PFAIL nodes. */ + pfail_wanted--; + } + dictReleaseIterator(di); + } + + /* Compute the actual total length and send! */ + uint32_t totlen = 0; + totlen += writePingExt(hdr, gossipcount); + totlen += sizeof(clusterMsg)-sizeof(union clusterMsgData); + totlen += (sizeof(clusterMsgDataGossip)*gossipcount); + serverAssert(gossipcount < USHRT_MAX); + hdr->count = htons(gossipcount); + hdr->totlen = htonl(totlen); + + clusterSendMessage(link,msgblock); + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* Send a PONG packet to every connected node that's not in handshake state + * and for which we have a valid link. + * + * In Redis Cluster pongs are not used just for failure detection, but also + * to carry important configuration information. So broadcasting a pong is + * useful when something changes in the configuration and we want to make + * the cluster aware ASAP (for instance after a slave promotion). + * + * The 'target' argument specifies the receiving instances using the + * defines below: + * + * CLUSTER_BROADCAST_ALL -> All known instances. + * CLUSTER_BROADCAST_LOCAL_SLAVES -> All slaves in my master-slaves ring. 
+ */ +#define CLUSTER_BROADCAST_ALL 0 +#define CLUSTER_BROADCAST_LOCAL_SLAVES 1 +void clusterBroadcastPong(int target) { + dictIterator *di; + dictEntry *de; + + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + if (!node->link) continue; + if (node == myself || nodeInHandshake(node)) continue; + if (target == CLUSTER_BROADCAST_LOCAL_SLAVES) { + int local_slave = + nodeIsSlave(node) && node->slaveof && + (node->slaveof == myself || node->slaveof == myself->slaveof); + if (!local_slave) continue; + } + clusterSendPing(node->link,CLUSTERMSG_TYPE_PONG); + } + dictReleaseIterator(di); +} + +/* Create a PUBLISH message block. + * + * Sanitizer suppression: In clusterMsgDataPublish, sizeof(bulk_data) is 8. + * As all the struct is used as a buffer, when more than 8 bytes are copied into + * the 'bulk_data', sanitizer generates an out-of-bounds error which is a false + * positive in this context. */ +REDIS_NO_SANITIZE("bounds") +clusterMsgSendBlock *clusterCreatePublishMsgBlock(robj *channel, robj *message, uint16_t type) { + + uint32_t channel_len, message_len; + + channel = getDecodedObject(channel); + message = getDecodedObject(message); + channel_len = sdslen(channel->ptr); + message_len = sdslen(message->ptr); + + size_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + msglen += sizeof(clusterMsgDataPublish) - 8 + channel_len + message_len; + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, msglen); + + clusterMsg *hdr = &msgblock->msg; + hdr->data.publish.msg.channel_len = htonl(channel_len); + hdr->data.publish.msg.message_len = htonl(message_len); + memcpy(hdr->data.publish.msg.bulk_data,channel->ptr,sdslen(channel->ptr)); + memcpy(hdr->data.publish.msg.bulk_data+sdslen(channel->ptr), + message->ptr,sdslen(message->ptr)); + + decrRefCount(channel); + decrRefCount(message); + + return msgblock; +} + +/* Send a FAIL message to all the nodes we are able to 
contact. + * The FAIL message is sent when we detect that a node is failing + * (CLUSTER_NODE_PFAIL) and we also receive a gossip confirmation of this: + * we switch the node state to CLUSTER_NODE_FAIL and ask all the other + * nodes to do the same ASAP. */ +void clusterSendFail(char *nodename) { + uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + + sizeof(clusterMsgDataFail); + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAIL, msglen); + + clusterMsg *hdr = &msgblock->msg; + memcpy(hdr->data.fail.about.nodename,nodename,CLUSTER_NAMELEN); + + clusterBroadcastMessage(msgblock); + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* Send an UPDATE message to the specified link carrying the specified 'node' + * slots configuration. The node name, slots bitmap, and configEpoch info + * are included. */ +void clusterSendUpdate(clusterLink *link, clusterNode *node) { + if (link == NULL) return; + + uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + + sizeof(clusterMsgDataUpdate); + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_UPDATE, msglen); + + clusterMsg *hdr = &msgblock->msg; + memcpy(hdr->data.update.nodecfg.nodename,node->name,CLUSTER_NAMELEN); + hdr->data.update.nodecfg.configEpoch = htonu64(node->configEpoch); + memcpy(hdr->data.update.nodecfg.slots,node->slots,sizeof(node->slots)); + for (unsigned int i = 0; i < sizeof(node->slots); i++) { + /* Don't advertise slots that the node stopped claiming */ + hdr->data.update.nodecfg.slots[i] = hdr->data.update.nodecfg.slots[i] & (~server.cluster->owner_not_claiming_slot[i]); + } + + clusterSendMessage(link,msgblock); + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* Send a MODULE message. + * + * If link is NULL, then the message is broadcasted to the whole cluster. 
*/ +void clusterSendModule(clusterLink *link, uint64_t module_id, uint8_t type, + const char *payload, uint32_t len) { + uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + msglen += sizeof(clusterMsgModule) - 3 + len; + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MODULE, msglen); + + clusterMsg *hdr = &msgblock->msg; + hdr->data.module.msg.module_id = module_id; /* Already endian adjusted. */ + hdr->data.module.msg.type = type; + hdr->data.module.msg.len = htonl(len); + memcpy(hdr->data.module.msg.bulk_data,payload,len); + + if (link) + clusterSendMessage(link,msgblock); + else + clusterBroadcastMessage(msgblock); + + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* This function gets a cluster node ID string as target, the same way the nodes + * addresses are represented in the modules side, resolves the node, and sends + * the message. If the target is NULL the message is broadcasted. + * + * The function returns C_OK if the target is valid, otherwise C_ERR is + * returned. */ +int clusterSendModuleMessageToTarget(const char *target, uint64_t module_id, uint8_t type, const char *payload, uint32_t len) { + clusterNode *node = NULL; + + if (target != NULL) { + node = clusterLookupNode(target, strlen(target)); + if (node == NULL || node->link == NULL) return C_ERR; + } + + clusterSendModule(target ? node->link : NULL, + module_id, type, payload, len); + return C_OK; +} + +/* ----------------------------------------------------------------------------- + * CLUSTER Pub/Sub support + * + * If `sharded` is 0: + * For now we do very little, just propagating [S]PUBLISH messages across the whole + * cluster. In the future we'll try to get smarter and avoiding propagating those + * messages to hosts without receives for a given channel. + * Otherwise: + * Publish this message across the slot (primary/replica). 
+ * -------------------------------------------------------------------------- */ +void clusterPropagatePublish(robj *channel, robj *message, int sharded) { + clusterMsgSendBlock *msgblock; + + if (!sharded) { + msgblock = clusterCreatePublishMsgBlock(channel, message, CLUSTERMSG_TYPE_PUBLISH); + clusterBroadcastMessage(msgblock); + clusterMsgSendBlockDecrRefCount(msgblock); + return; + } + + listIter li; + listNode *ln; + list *nodes_for_slot = clusterGetNodesInMyShard(server.cluster->myself); + serverAssert(nodes_for_slot != NULL); + listRewind(nodes_for_slot, &li); + msgblock = clusterCreatePublishMsgBlock(channel, message, CLUSTERMSG_TYPE_PUBLISHSHARD); + while((ln = listNext(&li))) { + clusterNode *node = listNodeValue(ln); + if (node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE)) + continue; + clusterSendMessage(node->link,msgblock); + } + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* ----------------------------------------------------------------------------- + * SLAVE node specific functions + * -------------------------------------------------------------------------- */ + +/* This function sends a FAILOVER_AUTH_REQUEST message to every node in order to + * see if there is the quorum for this slave instance to failover its failing + * master. + * + * Note that we send the failover request to everybody, master and slave nodes, + * but only the masters are supposed to reply to our query. */ +void clusterRequestFailoverAuth(void) { + uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST, msglen); + + clusterMsg *hdr = &msgblock->msg; + /* If this is a manual failover, set the CLUSTERMSG_FLAG0_FORCEACK bit + * in the header to communicate the nodes receiving the message that + * they should authorized the failover even if the master is working. 
*/ + if (server.cluster->mf_end) hdr->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK; + clusterBroadcastMessage(msgblock); + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* Send a FAILOVER_AUTH_ACK message to the specified node. */ +void clusterSendFailoverAuth(clusterNode *node) { + if (!node->link) return; + + uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK, msglen); + + clusterSendMessage(node->link,msgblock); + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* Send a MFSTART message to the specified node. */ +void clusterSendMFStart(clusterNode *node) { + if (!node->link) return; + + uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MFSTART, msglen); + + clusterSendMessage(node->link,msgblock); + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* Vote for the node asking for our vote if there are the conditions. */ +void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { + clusterNode *master = node->slaveof; + uint64_t requestCurrentEpoch = ntohu64(request->currentEpoch); + uint64_t requestConfigEpoch = ntohu64(request->configEpoch); + unsigned char *claimed_slots = request->myslots; + int force_ack = request->mflags[0] & CLUSTERMSG_FLAG0_FORCEACK; + int j; + + /* IF we are not a master serving at least 1 slot, we don't have the + * right to vote, as the cluster size in Redis Cluster is the number + * of masters serving at least one slot, and quorum is the cluster + * size + 1 */ + if (nodeIsSlave(myself) || myself->numslots == 0) return; + + /* Request epoch must be >= our currentEpoch. + * Note that it is impossible for it to actually be greater since + * our currentEpoch was updated as a side effect of receiving this + * request, if the request epoch was greater. 
*/ + if (requestCurrentEpoch < server.cluster->currentEpoch) { + serverLog(LL_WARNING, + "Failover auth denied to %.40s (%s): reqEpoch (%llu) < curEpoch(%llu)", + node->name, node->human_nodename, + (unsigned long long) requestCurrentEpoch, + (unsigned long long) server.cluster->currentEpoch); + return; + } + + /* I already voted for this epoch? Return ASAP. */ + if (server.cluster->lastVoteEpoch == server.cluster->currentEpoch) { + serverLog(LL_WARNING, + "Failover auth denied to %.40s (%s): already voted for epoch %llu", + node->name, node->human_nodename, + (unsigned long long) server.cluster->currentEpoch); + return; + } + + /* Node must be a slave and its master down. + * The master can be non failing if the request is flagged + * with CLUSTERMSG_FLAG0_FORCEACK (manual failover). */ + if (clusterNodeIsMaster(node) || master == NULL || + (!nodeFailed(master) && !force_ack)) + { + if (clusterNodeIsMaster(node)) { + serverLog(LL_WARNING, + "Failover auth denied to %.40s (%s): it is a master node", + node->name, node->human_nodename); + } else if (master == NULL) { + serverLog(LL_WARNING, + "Failover auth denied to %.40s (%s): I don't know its master", + node->name, node->human_nodename); + } else if (!nodeFailed(master)) { + serverLog(LL_WARNING, + "Failover auth denied to %.40s (%s): its master is up", + node->name, node->human_nodename); + } + return; + } + + /* We did not voted for a slave about this master for two + * times the node timeout. This is not strictly needed for correctness + * of the algorithm but makes the base case more linear. 
*/ + if (mstime() - node->slaveof->voted_time < server.cluster_node_timeout * 2) + { + serverLog(LL_WARNING, + "Failover auth denied to %.40s %s: " + "can't vote about this master before %lld milliseconds", + node->name, node->human_nodename, + (long long) ((server.cluster_node_timeout*2)- + (mstime() - node->slaveof->voted_time))); + return; + } + + /* The slave requesting the vote must have a configEpoch for the claimed + * slots that is >= the one of the masters currently serving the same + * slots in the current configuration. */ + for (j = 0; j < CLUSTER_SLOTS; j++) { + if (bitmapTestBit(claimed_slots, j) == 0) continue; + if (isSlotUnclaimed(j) || + server.cluster->slots[j]->configEpoch <= requestConfigEpoch) + { + continue; + } + /* If we reached this point we found a slot that in our current slots + * is served by a master with a greater configEpoch than the one claimed + * by the slave requesting our vote. Refuse to vote for this slave. */ + serverLog(LL_WARNING, + "Failover auth denied to %.40s (%s): " + "slot %d epoch (%llu) > reqEpoch (%llu)", + node->name, node->human_nodename, j, + (unsigned long long) server.cluster->slots[j]->configEpoch, + (unsigned long long) requestConfigEpoch); + return; + } + + /* We can vote for this slave. */ + server.cluster->lastVoteEpoch = server.cluster->currentEpoch; + node->slaveof->voted_time = mstime(); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_FSYNC_CONFIG); + clusterSendFailoverAuth(node); + serverLog(LL_NOTICE, "Failover auth granted to %.40s (%s) for epoch %llu", + node->name, node->human_nodename, (unsigned long long) server.cluster->currentEpoch); +} + +/* This function returns the "rank" of this instance, a slave, in the context + * of its master-slaves ring. The rank of the slave is given by the number of + * other slaves for the same master that have a better replication offset + * compared to the local one (better means, greater, so they claim more data). 
+ * + * A slave with rank 0 is the one with the greatest (most up to date) + * replication offset, and so forth. Note that because how the rank is computed + * multiple slaves may have the same rank, in case they have the same offset. + * + * The slave rank is used to add a delay to start an election in order to + * get voted and replace a failing master. Slaves with better replication + * offsets are more likely to win. */ +int clusterGetSlaveRank(void) { + long long myoffset; + int j, rank = 0; + clusterNode *master; + + serverAssert(nodeIsSlave(myself)); + master = myself->slaveof; + if (master == NULL) return 0; /* Never called by slaves without master. */ + + myoffset = replicationGetSlaveOffset(); + for (j = 0; j < master->numslaves; j++) + if (master->slaves[j] != myself && + !nodeCantFailover(master->slaves[j]) && + master->slaves[j]->repl_offset > myoffset) rank++; + return rank; +} + +/* This function is called by clusterHandleSlaveFailover() in order to + * let the slave log why it is not able to failover. Sometimes there are + * not the conditions, but since the failover function is called again and + * again, we can't log the same things continuously. + * + * This function works by logging only if a given set of conditions are + * true: + * + * 1) The reason for which the failover can't be initiated changed. + * The reasons also include a NONE reason we reset the state to + * when the slave finds that its master is fine (no FAIL flag). + * 2) Also, the log is emitted again if the master is still down and + * the reason for not failing over is still the same, but more than + * CLUSTER_CANT_FAILOVER_RELOG_PERIOD seconds elapsed. + * 3) Finally, the function only logs if the slave is down for more than + * five seconds + NODE_TIMEOUT. This way nothing is logged when a + * failover starts in a reasonable time. + * + * The function is called with the reason why the slave can't failover + * which is one of the integer macros CLUSTER_CANT_FAILOVER_*. 
+ * + * The function is guaranteed to be called only if 'myself' is a slave. */ +void clusterLogCantFailover(int reason) { + char *msg; + static time_t lastlog_time = 0; + mstime_t nolog_fail_time = server.cluster_node_timeout + 5000; + + /* Don't log if we have the same reason for some time. */ + if (reason == server.cluster->cant_failover_reason && + time(NULL)-lastlog_time < CLUSTER_CANT_FAILOVER_RELOG_PERIOD) + return; + + server.cluster->cant_failover_reason = reason; + + /* We also don't emit any log if the master failed no long ago, the + * goal of this function is to log slaves in a stalled condition for + * a long time. */ + if (myself->slaveof && + nodeFailed(myself->slaveof) && + (mstime() - myself->slaveof->fail_time) < nolog_fail_time) return; + + switch(reason) { + case CLUSTER_CANT_FAILOVER_DATA_AGE: + msg = "Disconnected from master for longer than allowed. " + "Please check the 'cluster-replica-validity-factor' configuration " + "option."; + break; + case CLUSTER_CANT_FAILOVER_WAITING_DELAY: + msg = "Waiting the delay before I can start a new failover."; + break; + case CLUSTER_CANT_FAILOVER_EXPIRED: + msg = "Failover attempt expired."; + break; + case CLUSTER_CANT_FAILOVER_WAITING_VOTES: + msg = "Waiting for votes, but majority still not reached."; + break; + default: + msg = "Unknown reason code."; + break; + } + lastlog_time = time(NULL); + serverLog(LL_NOTICE,"Currently unable to failover: %s", msg); + + int cur_vote = server.cluster->failover_auth_count; + int cur_quorum = (server.cluster->size / 2) + 1; + /* Emits a log when an election is in progress and waiting for votes or when the failover attempt expired. */ + if (reason == CLUSTER_CANT_FAILOVER_WAITING_VOTES || reason == CLUSTER_CANT_FAILOVER_EXPIRED) { + serverLog(LL_NOTICE, "Needed quorum: %d. 
Number of votes received so far: %d", cur_quorum, cur_vote); + } +} + +/* This function implements the final part of automatic and manual failovers, + * where the slave grabs its master's hash slots, and propagates the new + * configuration. + * + * Note that it's up to the caller to be sure that the node got a new + * configuration epoch already. */ +void clusterFailoverReplaceYourMaster(void) { + int j; + clusterNode *oldmaster = myself->slaveof; + + if (clusterNodeIsMaster(myself) || oldmaster == NULL) return; + + /* 1) Turn this node into a master. */ + clusterSetNodeAsMaster(myself); + replicationUnsetMaster(); + + /* 2) Claim all the slots assigned to our master. */ + for (j = 0; j < CLUSTER_SLOTS; j++) { + if (clusterNodeCoversSlot(oldmaster, j)) { + clusterDelSlot(j); + clusterAddSlot(myself,j); + } + } + + /* 3) Update state and save config. */ + clusterUpdateState(); + clusterSaveConfigOrDie(1); + + /* 4) Pong all the other nodes so that they can update the state + * accordingly and detect that we switched to master role. */ + clusterBroadcastPong(CLUSTER_BROADCAST_ALL); + + /* 5) If there was a manual failover in progress, clear the state. */ + resetManualFailover(); +} + +/* This function is called if we are a slave node and our master serving + * a non-zero amount of hash slots is in FAIL state. + * + * The goal of this function is: + * 1) To check if we are able to perform a failover, is our data updated? + * 2) Try to get elected by masters. + * 3) Perform the failover informing all the other nodes. 
+ */ +void clusterHandleSlaveFailover(void) { + mstime_t data_age; + mstime_t auth_age = mstime() - server.cluster->failover_auth_time; + int needed_quorum = (server.cluster->size / 2) + 1; + int manual_failover = server.cluster->mf_end != 0 && + server.cluster->mf_can_start; + mstime_t auth_timeout, auth_retry_time; + + server.cluster->todo_before_sleep &= ~CLUSTER_TODO_HANDLE_FAILOVER; + + /* Compute the failover timeout (the max time we have to send votes + * and wait for replies), and the failover retry time (the time to wait + * before trying to get voted again). + * + * Timeout is MAX(NODE_TIMEOUT*2,2000) milliseconds. + * Retry is two times the Timeout. + */ + auth_timeout = server.cluster_node_timeout*2; + if (auth_timeout < 2000) auth_timeout = 2000; + auth_retry_time = auth_timeout*2; + + /* Pre conditions to run the function, that must be met both in case + * of an automatic or manual failover: + * 1) We are a slave. + * 2) Our master is flagged as FAIL, or this is a manual failover. + * 3) We don't have the no failover configuration set, and this is + * not a manual failover. + * 4) It is serving slots. */ + if (clusterNodeIsMaster(myself) || + myself->slaveof == NULL || + (!nodeFailed(myself->slaveof) && !manual_failover) || + (server.cluster_slave_no_failover && !manual_failover) || + myself->slaveof->numslots == 0) + { + /* There are no reasons to failover, so we set the reason why we + * are returning without failing over to NONE. */ + server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE; + return; + } + + /* Set data_age to the number of milliseconds we are disconnected from + * the master. 
*/ + if (server.repl_state == REPL_STATE_CONNECTED) { + data_age = (mstime_t)(server.unixtime - server.master->lastinteraction) + * 1000; + } else { + data_age = (mstime_t)(server.unixtime - server.repl_down_since) * 1000; + } + + /* Remove the node timeout from the data age as it is fine that we are + * disconnected from our master at least for the time it was down to be + * flagged as FAIL, that's the baseline. */ + if (data_age > server.cluster_node_timeout) + data_age -= server.cluster_node_timeout; + + /* Check if our data is recent enough according to the slave validity + * factor configured by the user. + * + * Check bypassed for manual failovers. */ + if (server.cluster_slave_validity_factor && + data_age > + (((mstime_t)server.repl_ping_slave_period * 1000) + + (server.cluster_node_timeout * server.cluster_slave_validity_factor))) + { + if (!manual_failover) { + clusterLogCantFailover(CLUSTER_CANT_FAILOVER_DATA_AGE); + return; + } + } + + /* If the previous failover attempt timeout and the retry time has + * elapsed, we can setup a new one. */ + if (auth_age > auth_retry_time) { + server.cluster->failover_auth_time = mstime() + + 500 + /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */ + random() % 500; /* Random delay between 0 and 500 milliseconds. */ + server.cluster->failover_auth_count = 0; + server.cluster->failover_auth_sent = 0; + server.cluster->failover_auth_rank = clusterGetSlaveRank(); + /* We add another delay that is proportional to the slave rank. + * Specifically 1 second * rank. This way slaves that have a probably + * less updated replication offset, are penalized. */ + server.cluster->failover_auth_time += + server.cluster->failover_auth_rank * 1000; + /* However if this is a manual failover, no delay is needed. 
*/ + if (server.cluster->mf_end) { + server.cluster->failover_auth_time = mstime(); + server.cluster->failover_auth_rank = 0; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); + } + serverLog(LL_NOTICE, + "Start of election delayed for %lld milliseconds " + "(rank #%d, offset %lld).", + server.cluster->failover_auth_time - mstime(), + server.cluster->failover_auth_rank, + replicationGetSlaveOffset()); + /* Now that we have a scheduled election, broadcast our offset + * to all the other slaves so that they'll updated their offsets + * if our offset is better. */ + clusterBroadcastPong(CLUSTER_BROADCAST_LOCAL_SLAVES); + return; + } + + /* It is possible that we received more updated offsets from other + * slaves for the same master since we computed our election delay. + * Update the delay if our rank changed. + * + * Not performed if this is a manual failover. */ + if (server.cluster->failover_auth_sent == 0 && + server.cluster->mf_end == 0) + { + int newrank = clusterGetSlaveRank(); + if (newrank > server.cluster->failover_auth_rank) { + long long added_delay = + (newrank - server.cluster->failover_auth_rank) * 1000; + server.cluster->failover_auth_time += added_delay; + server.cluster->failover_auth_rank = newrank; + serverLog(LL_NOTICE, + "Replica rank updated to #%d, added %lld milliseconds of delay.", + newrank, added_delay); + } + } + + /* Return ASAP if we can't still start the election. */ + if (mstime() < server.cluster->failover_auth_time) { + clusterLogCantFailover(CLUSTER_CANT_FAILOVER_WAITING_DELAY); + return; + } + + /* Return ASAP if the election is too old to be valid. */ + if (auth_age > auth_timeout) { + clusterLogCantFailover(CLUSTER_CANT_FAILOVER_EXPIRED); + return; + } + + /* Ask for votes if needed. 
*/ + if (server.cluster->failover_auth_sent == 0) { + server.cluster->currentEpoch++; + server.cluster->failover_auth_epoch = server.cluster->currentEpoch; + serverLog(LL_NOTICE,"Starting a failover election for epoch %llu.", + (unsigned long long) server.cluster->currentEpoch); + clusterRequestFailoverAuth(); + server.cluster->failover_auth_sent = 1; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_FSYNC_CONFIG); + return; /* Wait for replies. */ + } + + /* Check if we reached the quorum. */ + if (server.cluster->failover_auth_count >= needed_quorum) { + /* We have the quorum, we can finally failover the master. */ + + serverLog(LL_NOTICE, + "Failover election won: I'm the new master."); + + /* Update my configEpoch to the epoch of the election. */ + if (myself->configEpoch < server.cluster->failover_auth_epoch) { + myself->configEpoch = server.cluster->failover_auth_epoch; + serverLog(LL_NOTICE, + "configEpoch set to %llu after successful failover", + (unsigned long long) myself->configEpoch); + } + + /* Take responsibility for the cluster slots. */ + clusterFailoverReplaceYourMaster(); + } else { + clusterLogCantFailover(CLUSTER_CANT_FAILOVER_WAITING_VOTES); + } +} + +/* ----------------------------------------------------------------------------- + * CLUSTER slave migration + * + * Slave migration is the process that allows a slave of a master that is + * already covered by at least another slave, to "migrate" to a master that + * is orphaned, that is, left with no working slaves. + * ------------------------------------------------------------------------- */ + +/* This function is responsible to decide if this replica should be migrated + * to a different (orphaned) master. It is called by the clusterCron() function + * only if: + * + * 1) We are a slave node. + * 2) It was detected that there is at least one orphaned master in + * the cluster. 
+ * 3) We are a slave of one of the masters with the greatest number of + * slaves. + * + * This checks are performed by the caller since it requires to iterate + * the nodes anyway, so we spend time into clusterHandleSlaveMigration() + * if definitely needed. + * + * The function is called with a pre-computed max_slaves, that is the max + * number of working (not in FAIL state) slaves for a single master. + * + * Additional conditions for migration are examined inside the function. + */ +void clusterHandleSlaveMigration(int max_slaves) { + int j, okslaves = 0; + clusterNode *mymaster = myself->slaveof, *target = NULL, *candidate = NULL; + dictIterator *di; + dictEntry *de; + + /* Step 1: Don't migrate if the cluster state is not ok. */ + if (server.cluster->state != CLUSTER_OK) return; + + /* Step 2: Don't migrate if my master will not be left with at least + * 'migration-barrier' slaves after my migration. */ + if (mymaster == NULL) return; + for (j = 0; j < mymaster->numslaves; j++) + if (!nodeFailed(mymaster->slaves[j]) && + !nodeTimedOut(mymaster->slaves[j])) okslaves++; + if (okslaves <= server.cluster_migration_barrier) return; + + /* Step 3: Identify a candidate for migration, and check if among the + * masters with the greatest number of ok slaves, I'm the one with the + * smallest node ID (the "candidate slave"). + * + * Note: this means that eventually a replica migration will occur + * since slaves that are reachable again always have their FAIL flag + * cleared, so eventually there must be a candidate. + * There is a possible race condition causing multiple + * slaves to migrate at the same time, but this is unlikely to + * happen and relatively harmless when it does. 
*/ + candidate = myself; + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + int okslaves = 0, is_orphaned = 1; + + /* We want to migrate only if this master is working, orphaned, and + * used to have slaves or if failed over a master that had slaves + * (MIGRATE_TO flag). This way we only migrate to instances that were + * supposed to have replicas. */ + if (nodeIsSlave(node) || nodeFailed(node)) is_orphaned = 0; + if (!(node->flags & CLUSTER_NODE_MIGRATE_TO)) is_orphaned = 0; + + /* Check number of working slaves. */ + if (clusterNodeIsMaster(node)) okslaves = clusterCountNonFailingSlaves(node); + if (okslaves > 0) is_orphaned = 0; + + if (is_orphaned) { + if (!target && node->numslots > 0) target = node; + + /* Track the starting time of the orphaned condition for this + * master. */ + if (!node->orphaned_time) node->orphaned_time = mstime(); + } else { + node->orphaned_time = 0; + } + + /* Check if I'm the slave candidate for the migration: attached + * to a master with the maximum number of slaves and with the smallest + * node ID. */ + if (okslaves == max_slaves) { + for (j = 0; j < node->numslaves; j++) { + if (memcmp(node->slaves[j]->name, + candidate->name, + CLUSTER_NAMELEN) < 0) + { + candidate = node->slaves[j]; + } + } + } + } + dictReleaseIterator(di); + + /* Step 4: perform the migration if there is a target, and if I'm the + * candidate, but only if the master is continuously orphaned for a + * couple of seconds, so that during failovers, we give some time to + * the natural slaves of this instance to advertise their switch from + * the old master to the new one. 
*/ + if (target && candidate == myself && + (mstime()-target->orphaned_time) > CLUSTER_SLAVE_MIGRATION_DELAY && + !(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER)) + { + serverLog(LL_NOTICE,"Migrating to orphaned master %.40s", + target->name); + clusterSetMaster(target); + } +} + +/* ----------------------------------------------------------------------------- + * CLUSTER manual failover + * + * This are the important steps performed by slaves during a manual failover: + * 1) User send CLUSTER FAILOVER command. The failover state is initialized + * setting mf_end to the millisecond unix time at which we'll abort the + * attempt. + * 2) Slave sends a MFSTART message to the master requesting to pause clients + * for two times the manual failover timeout CLUSTER_MF_TIMEOUT. + * When master is paused for manual failover, it also starts to flag + * packets with CLUSTERMSG_FLAG0_PAUSED. + * 3) Slave waits for master to send its replication offset flagged as PAUSED. + * 4) If slave received the offset from the master, and its offset matches, + * mf_can_start is set to 1, and clusterHandleSlaveFailover() will perform + * the failover as usually, with the difference that the vote request + * will be modified to force masters to vote for a slave that has a + * working master. + * + * From the point of view of the master things are simpler: when a + * PAUSE_CLIENTS packet is received the master sets mf_end as well and + * the sender in mf_slave. During the time limit for the manual failover + * the master will just send PINGs more often to this slave, flagged with + * the PAUSED flag, so that the slave will set mf_master_offset when receiving + * a packet from the master with this flag set. + * + * The goal of the manual failover is to perform a fast failover without + * data loss due to the asynchronous master-slave replication. + * -------------------------------------------------------------------------- */ + +/* Reset the manual failover state. 
This works for both masters and slaves + * as all the state about manual failover is cleared. + * + * The function can be used both to initialize the manual failover state at + * startup or to abort a manual failover in progress. */ +void resetManualFailover(void) { + if (server.cluster->mf_slave) { + /* We were a master failing over, so we paused clients and related actions. + * Regardless of the outcome we unpause now to allow traffic again. */ + unpauseActions(PAUSE_DURING_FAILOVER); + } + server.cluster->mf_end = 0; /* No manual failover in progress. */ + server.cluster->mf_can_start = 0; + server.cluster->mf_slave = NULL; + server.cluster->mf_master_offset = -1; +} + +/* If a manual failover timed out, abort it. */ +void manualFailoverCheckTimeout(void) { + if (server.cluster->mf_end && server.cluster->mf_end < mstime()) { + serverLog(LL_WARNING,"Manual failover timed out."); + resetManualFailover(); + } +} + +/* This function is called from the cluster cron function in order to go + * forward with a manual failover state machine. */ +void clusterHandleManualFailover(void) { + /* Return ASAP if no manual failover is in progress. */ + if (server.cluster->mf_end == 0) return; + + /* If mf_can_start is non-zero, the failover was already triggered so the + * next steps are performed by clusterHandleSlaveFailover(). */ + if (server.cluster->mf_can_start) return; + + if (server.cluster->mf_master_offset == -1) return; /* Wait for offset... */ + + if (server.cluster->mf_master_offset == replicationGetSlaveOffset()) { + /* Our replication offset matches the master replication offset + * announced after clients were paused. We can start the failover. 
*/ + server.cluster->mf_can_start = 1; + serverLog(LL_NOTICE, + "All master replication stream processed, " + "manual failover can start."); + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); + return; + } + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER); +} + +/* ----------------------------------------------------------------------------- + * CLUSTER cron job + * -------------------------------------------------------------------------- */ + +/* Check if the node is disconnected and re-establish the connection. + * Also update a few stats while we are here, that can be used to make + * better decisions in other part of the code. */ +static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t handshake_timeout, mstime_t now) { + /* Not interested in reconnecting the link with myself or nodes + * for which we have no address. */ + if (node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_NOADDR)) return 1; + + if (node->flags & CLUSTER_NODE_PFAIL) + server.cluster->stats_pfail_nodes++; + + /* A Node in HANDSHAKE state has a limited lifespan equal to the + * configured node timeout. */ + if (nodeInHandshake(node) && now - node->ctime > handshake_timeout) { + clusterDelNode(node); + return 1; + } + + if (node->link == NULL) { + clusterLink *link = createClusterLink(node); + link->conn = connCreate(connTypeOfCluster()); + connSetPrivateData(link->conn, link); + if (connConnect(link->conn, node->ip, node->cport, server.bind_source_addr, + clusterLinkConnectHandler) == C_ERR) { + /* We got a synchronous error from connect before + * clusterSendPing() had a chance to be called. + * If node->ping_sent is zero, failure detection can't work, + * so we claim we actually sent a ping now (that will + * be really sent as soon as the link is obtained). 
*/ + if (node->ping_sent == 0) node->ping_sent = mstime(); + serverLog(LL_DEBUG, "Unable to connect to " + "Cluster Node [%s]:%d -> %s", node->ip, + node->cport, server.neterr); + + freeClusterLink(link); + return 0; + } + } + return 0; +} + +static void freeClusterLinkOnBufferLimitReached(clusterLink *link) { + if (link == NULL || server.cluster_link_msg_queue_limit_bytes == 0) { + return; + } + + unsigned long long mem_link = link->send_msg_queue_mem; + if (mem_link > server.cluster_link_msg_queue_limit_bytes) { + serverLog(LL_WARNING, "Freeing cluster link(%s node %.40s, used memory: %llu) due to " + "exceeding send buffer memory limit.", link->inbound ? "from" : "to", + link->node ? link->node->name : "", mem_link); + freeClusterLink(link); + server.cluster->stat_cluster_links_buffer_limit_exceeded++; + } +} + +/* Free outbound link to a node if its send buffer size exceeded limit. */ +static void clusterNodeCronFreeLinkOnBufferLimitReached(clusterNode *node) { + freeClusterLinkOnBufferLimitReached(node->link); + freeClusterLinkOnBufferLimitReached(node->inbound_link); +} + +/* This is executed 10 times every second */ +void clusterCron(void) { + dictIterator *di; + dictEntry *de; + int update_state = 0; + int orphaned_masters; /* How many masters there are without ok slaves. */ + int max_slaves; /* Max number of ok slaves for a single master. */ + int this_slaves; /* Number of ok slaves for our master (if we are slave). */ + mstime_t min_pong = 0, now = mstime(); + clusterNode *min_pong_node = NULL; + static unsigned long long iteration = 0; + mstime_t handshake_timeout; + + iteration++; /* Number of times this function was called so far. */ + + clusterUpdateMyselfHostname(); + + /* The handshake timeout is the time after which a handshake node that was + * not turned into a normal node is removed from the nodes. Usually it is + * just the NODE_TIMEOUT value, but when NODE_TIMEOUT is too small we use + * the value of 1 second. 
*/ + handshake_timeout = server.cluster_node_timeout; + if (handshake_timeout < 1000) handshake_timeout = 1000; + + /* Clear so clusterNodeCronHandleReconnect can count the number of nodes in PFAIL. */ + server.cluster->stats_pfail_nodes = 0; + /* Run through some of the operations we want to do on each cluster node. */ + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + /* We free the inbound or outboud link to the node if the link has an + * oversized message send queue and immediately try reconnecting. */ + clusterNodeCronFreeLinkOnBufferLimitReached(node); + /* The protocol is that function(s) below return non-zero if the node was + * terminated. + */ + if(clusterNodeCronHandleReconnect(node, handshake_timeout, now)) continue; + } + dictReleaseIterator(di); + + /* Ping some random node 1 time every 10 iterations, so that we usually ping + * one random node every second. */ + if (!(iteration % 10)) { + int j; + + /* Check a few random nodes and ping the one with the oldest + * pong_received time. */ + for (j = 0; j < 5; j++) { + de = dictGetRandomKey(server.cluster->nodes); + clusterNode *this = dictGetVal(de); + + /* Don't ping nodes disconnected or with a ping currently active. */ + if (this->link == NULL || this->ping_sent != 0) continue; + if (this->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE)) + continue; + if (min_pong_node == NULL || min_pong > this->pong_received) { + min_pong_node = this; + min_pong = this->pong_received; + } + } + if (min_pong_node) { + serverLog(LL_DEBUG,"Pinging node %.40s", min_pong_node->name); + clusterSendPing(min_pong_node->link, CLUSTERMSG_TYPE_PING); + } + } + + /* Iterate nodes to check if we need to flag something as failing. + * This loop is also responsible to: + * 1) Check if there are orphaned masters (masters without non failing + * slaves). + * 2) Count the max number of non failing slaves for a single master. 
+ * 3) Count the number of slaves for our master, if we are a slave. */ + orphaned_masters = 0; + max_slaves = 0; + this_slaves = 0; + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + now = mstime(); /* Use an updated time at every iteration. */ + + if (node->flags & + (CLUSTER_NODE_MYSELF|CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE)) + continue; + + /* Orphaned master check, useful only if the current instance + * is a slave that may migrate to another master. */ + if (nodeIsSlave(myself) && clusterNodeIsMaster(node) && !nodeFailed(node)) { + int okslaves = clusterCountNonFailingSlaves(node); + + /* A master is orphaned if it is serving a non-zero number of + * slots, have no working slaves, but used to have at least one + * slave, or failed over a master that used to have slaves. */ + if (okslaves == 0 && node->numslots > 0 && + node->flags & CLUSTER_NODE_MIGRATE_TO) + { + orphaned_masters++; + } + if (okslaves > max_slaves) max_slaves = okslaves; + if (myself->slaveof == node) + this_slaves = okslaves; + } + + /* If we are not receiving any data for more than half the cluster + * timeout, reconnect the link: maybe there is a connection + * issue even if the node is alive. */ + mstime_t ping_delay = now - node->ping_sent; + mstime_t data_delay = now - node->data_received; + if (node->link && /* is connected */ + now - node->link->ctime > + server.cluster_node_timeout && /* was not already reconnected */ + node->ping_sent && /* we already sent a ping */ + /* and we are waiting for the pong more than timeout/2 */ + ping_delay > server.cluster_node_timeout/2 && + /* and in such interval we are not seeing any traffic at all. */ + data_delay > server.cluster_node_timeout/2) + { + /* Disconnect the link, it will be reconnected automatically. 
*/ + freeClusterLink(node->link); + } + + /* If we have currently no active ping in this instance, and the + * received PONG is older than half the cluster timeout, send + * a new ping now, to ensure all the nodes are pinged without + * a too big delay. */ + mstime_t ping_interval = server.cluster_ping_interval ? + server.cluster_ping_interval : server.cluster_node_timeout/2; + if (node->link && + node->ping_sent == 0 && + (now - node->pong_received) > ping_interval) + { + clusterSendPing(node->link, CLUSTERMSG_TYPE_PING); + continue; + } + + /* If we are a master and one of the slaves requested a manual + * failover, ping it continuously. */ + if (server.cluster->mf_end && + clusterNodeIsMaster(myself) && + server.cluster->mf_slave == node && + node->link) + { + clusterSendPing(node->link, CLUSTERMSG_TYPE_PING); + continue; + } + + /* Check only if we have an active ping for this instance. */ + if (node->ping_sent == 0) continue; + + /* Check if this node looks unreachable. + * Note that if we already received the PONG, then node->ping_sent + * is zero, so can't reach this code at all, so we don't risk of + * checking for a PONG delay if we didn't sent the PING. + * + * We also consider every incoming data as proof of liveness, since + * our cluster bus link is also used for data: under heavy data + * load pong delays are possible. */ + mstime_t node_delay = (ping_delay < data_delay) ? ping_delay : + data_delay; + + if (node_delay > server.cluster_node_timeout) { + /* Timeout reached. Set the node as possibly failing if it is + * not already in this state. */ + if (!(node->flags & (CLUSTER_NODE_PFAIL|CLUSTER_NODE_FAIL))) { + serverLog(LL_DEBUG,"*** NODE %.40s possibly failing", + node->name); + node->flags |= CLUSTER_NODE_PFAIL; + update_state = 1; + } + } + } + dictReleaseIterator(di); + + /* If we are a slave node but the replication is still turned off, + * enable it if we know the address of our master and it appears to + * be up. 
*/ + if (nodeIsSlave(myself) && + server.masterhost == NULL && + myself->slaveof && + nodeHasAddr(myself->slaveof)) + { + replicationSetMaster(myself->slaveof->ip, getNodeDefaultReplicationPort(myself->slaveof)); + } + + /* Abort a manual failover if the timeout is reached. */ + manualFailoverCheckTimeout(); + + if (nodeIsSlave(myself)) { + clusterHandleManualFailover(); + if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER)) + clusterHandleSlaveFailover(); + /* If there are orphaned slaves, and we are a slave among the masters + * with the max number of non-failing slaves, consider migrating to + * the orphaned masters. Note that it does not make sense to try + * a migration if there is no master with at least *two* working + * slaves. */ + if (orphaned_masters && max_slaves >= 2 && this_slaves == max_slaves && + server.cluster_allow_replica_migration) + clusterHandleSlaveMigration(max_slaves); + } + + if (update_state || server.cluster->state == CLUSTER_FAIL) + clusterUpdateState(); +} + +/* This function is called before the event handler returns to sleep for + * events. It is useful to perform operations that must be done ASAP in + * reaction to events fired but that are not safe to perform inside event + * handlers, or to perform potentially expansive tasks that we need to do + * a single time before replying to clients. */ +void clusterBeforeSleep(void) { + int flags = server.cluster->todo_before_sleep; + + /* Reset our flags (not strictly needed since every single function + * called for flags set should be able to clear its flag). 
*/ + server.cluster->todo_before_sleep = 0; + + if (flags & CLUSTER_TODO_HANDLE_MANUALFAILOVER) { + /* Handle manual failover as soon as possible so that won't have a 100ms + * as it was handled only in clusterCron */ + if(nodeIsSlave(myself)) { + clusterHandleManualFailover(); + if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER)) + clusterHandleSlaveFailover(); + } + } else if (flags & CLUSTER_TODO_HANDLE_FAILOVER) { + /* Handle failover, this is needed when it is likely that there is already + * the quorum from masters in order to react fast. */ + clusterHandleSlaveFailover(); + } + + /* Update the cluster state. */ + if (flags & CLUSTER_TODO_UPDATE_STATE) + clusterUpdateState(); + + /* Save the config, possibly using fsync. */ + if (flags & CLUSTER_TODO_SAVE_CONFIG) { + int fsync = flags & CLUSTER_TODO_FSYNC_CONFIG; + clusterSaveConfigOrDie(fsync); + } +} + +void clusterDoBeforeSleep(int flags) { + server.cluster->todo_before_sleep |= flags; +} + +/* ----------------------------------------------------------------------------- + * Slots management + * -------------------------------------------------------------------------- */ + +/* Test bit 'pos' in a generic bitmap. Return 1 if the bit is set, + * otherwise 0. */ +int bitmapTestBit(unsigned char *bitmap, int pos) { + off_t byte = pos/8; + int bit = pos&7; + return (bitmap[byte] & (1<<bit)) != 0; +} + +/* Set the bit at position 'pos' in a bitmap. */ +void bitmapSetBit(unsigned char *bitmap, int pos) { + off_t byte = pos/8; + int bit = pos&7; + bitmap[byte] |= 1<<bit; +} + +/* Clear the bit at position 'pos' in a bitmap. */ +void bitmapClearBit(unsigned char *bitmap, int pos) { + off_t byte = pos/8; + int bit = pos&7; + bitmap[byte] &= ~(1<<bit); +} + +/* Return non-zero if there is at least one master with slaves in the cluster. + * Otherwise zero is returned. Used by clusterNodeSetSlotBit() to set the + * MIGRATE_TO flag when a master gets its first slot. */ +int clusterMastersHaveSlaves(void) { + dictIterator *di = dictGetSafeIterator(server.cluster->nodes); + dictEntry *de; + int slaves = 0; + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + if (nodeIsSlave(node)) continue; + slaves += node->numslaves; + } + dictReleaseIterator(di); + return slaves != 0; +} + +/* Set the slot bit and return the old value. 
*/ +int clusterNodeSetSlotBit(clusterNode *n, int slot) { + int old = bitmapTestBit(n->slots,slot); + if (!old) { + bitmapSetBit(n->slots,slot); + n->numslots++; + /* When a master gets its first slot, even if it has no slaves, + * it gets flagged with MIGRATE_TO, that is, the master is a valid + * target for replicas migration, if and only if at least one of + * the other masters has slaves right now. + * + * Normally masters are valid targets of replica migration if: + * 1. The used to have slaves (but no longer have). + * 2. They are slaves failing over a master that used to have slaves. + * + * However new masters with slots assigned are considered valid + * migration targets if the rest of the cluster is not a slave-less. + * + * See https://github.com/redis/redis/issues/3043 for more info. */ + if (n->numslots == 1 && clusterMastersHaveSlaves()) + n->flags |= CLUSTER_NODE_MIGRATE_TO; + } + return old; +} + +/* Clear the slot bit and return the old value. */ +int clusterNodeClearSlotBit(clusterNode *n, int slot) { + int old = bitmapTestBit(n->slots,slot); + if (old) { + bitmapClearBit(n->slots,slot); + n->numslots--; + } + return old; +} + +/* Return the slot bit from the cluster node structure. */ +int clusterNodeCoversSlot(clusterNode *n, int slot) { + return bitmapTestBit(n->slots,slot); +} + +/* Add the specified slot to the list of slots that node 'n' will + * serve. Return C_OK if the operation ended with success. + * If the slot is already assigned to another instance this is considered + * an error and C_ERR is returned. */ +int clusterAddSlot(clusterNode *n, int slot) { + if (server.cluster->slots[slot]) return C_ERR; + clusterNodeSetSlotBit(n,slot); + server.cluster->slots[slot] = n; + return C_OK; +} + +/* Delete the specified slot marking it as unassigned. + * Returns C_OK if the slot was assigned, otherwise if the slot was + * already unassigned C_ERR is returned. 
*/ +int clusterDelSlot(int slot) { + clusterNode *n = server.cluster->slots[slot]; + + if (!n) return C_ERR; + + /* Cleanup the channels in master/replica as part of slot deletion. */ + removeChannelsInSlot(slot); + /* Clear the slot bit. */ + serverAssert(clusterNodeClearSlotBit(n,slot) == 1); + server.cluster->slots[slot] = NULL; + /* Make owner_not_claiming_slot flag consistent with slot ownership information. */ + bitmapClearBit(server.cluster->owner_not_claiming_slot, slot); + return C_OK; +} + +/* Delete all the slots associated with the specified node. + * The number of deleted slots is returned. */ +int clusterDelNodeSlots(clusterNode *node) { + int deleted = 0, j; + + for (j = 0; j < CLUSTER_SLOTS; j++) { + if (clusterNodeCoversSlot(node, j)) { + clusterDelSlot(j); + deleted++; + } + } + return deleted; +} + +/* Clear the migrating / importing state for all the slots. + * This is useful at initialization and when turning a master into slave. */ +void clusterCloseAllSlots(void) { + memset(server.cluster->migrating_slots_to,0, + sizeof(server.cluster->migrating_slots_to)); + memset(server.cluster->importing_slots_from,0, + sizeof(server.cluster->importing_slots_from)); +} + +/* ----------------------------------------------------------------------------- + * Cluster state evaluation function + * -------------------------------------------------------------------------- */ + +/* The following are defines that are only used in the evaluation function + * and are based on heuristics. Actually the main point about the rejoin and + * writable delay is that they should be a few orders of magnitude larger + * than the network latency. 
*/ +#define CLUSTER_MAX_REJOIN_DELAY 5000 +#define CLUSTER_MIN_REJOIN_DELAY 500 +#define CLUSTER_WRITABLE_DELAY 2000 + +void clusterUpdateState(void) { + int j, new_state; + int reachable_masters = 0; + static mstime_t among_minority_time; + static mstime_t first_call_time = 0; + + server.cluster->todo_before_sleep &= ~CLUSTER_TODO_UPDATE_STATE; + + /* If this is a master node, wait some time before turning the state + * into OK, since it is not a good idea to rejoin the cluster as a writable + * master, after a reboot, without giving the cluster a chance to + * reconfigure this node. Note that the delay is calculated starting from + * the first call to this function and not since the server start, in order + * to not count the DB loading time. */ + if (first_call_time == 0) first_call_time = mstime(); + if (clusterNodeIsMaster(myself) && + server.cluster->state == CLUSTER_FAIL && + mstime() - first_call_time < CLUSTER_WRITABLE_DELAY) return; + + /* Start assuming the state is OK. We'll turn it into FAIL if there + * are the right conditions. */ + new_state = CLUSTER_OK; + + /* Check if all the slots are covered. */ + if (server.cluster_require_full_coverage) { + for (j = 0; j < CLUSTER_SLOTS; j++) { + if (server.cluster->slots[j] == NULL || + server.cluster->slots[j]->flags & (CLUSTER_NODE_FAIL)) + { + new_state = CLUSTER_FAIL; + break; + } + } + } + + /* Compute the cluster size, that is the number of master nodes + * serving at least a single slot. + * + * At the same time count the number of reachable masters having + * at least one slot. 
*/ + { + dictIterator *di; + dictEntry *de; + + server.cluster->size = 0; + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + if (clusterNodeIsMaster(node) && node->numslots) { + server.cluster->size++; + if ((node->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) == 0) + reachable_masters++; + } + } + dictReleaseIterator(di); + } + + /* If we are in a minority partition, change the cluster state + * to FAIL. */ + { + int needed_quorum = (server.cluster->size / 2) + 1; + + if (reachable_masters < needed_quorum) { + new_state = CLUSTER_FAIL; + among_minority_time = mstime(); + } + } + + /* Log a state change */ + if (new_state != server.cluster->state) { + mstime_t rejoin_delay = server.cluster_node_timeout; + + /* If the instance is a master and was partitioned away with the + * minority, don't let it accept queries for some time after the + * partition heals, to make sure there is enough time to receive + * a configuration update. */ + if (rejoin_delay > CLUSTER_MAX_REJOIN_DELAY) + rejoin_delay = CLUSTER_MAX_REJOIN_DELAY; + if (rejoin_delay < CLUSTER_MIN_REJOIN_DELAY) + rejoin_delay = CLUSTER_MIN_REJOIN_DELAY; + + if (new_state == CLUSTER_OK && + clusterNodeIsMaster(myself) && + mstime() - among_minority_time < rejoin_delay) + { + return; + } + + /* Change the state and log the event. */ + serverLog(new_state == CLUSTER_OK ? LL_NOTICE : LL_WARNING, + "Cluster state changed: %s", + new_state == CLUSTER_OK ? "ok" : "fail"); + server.cluster->state = new_state; + } +} + +/* This function is called after the node startup in order to verify that data + * loaded from disk is in agreement with the cluster configuration: + * + * 1) If we find keys about hash slots we have no responsibility for, the + * following happens: + * A) If no other node is in charge according to the current cluster + * configuration, we add these slots to our node. 
 *    B) If according to our config other nodes are already in charge for
 *       this slots, we set the slots as IMPORTING from our point of view
 *       in order to justify we have those slots, and in order to make
 *       redis-cli aware of the issue, so that it can try to fix it.
 * 2) If we find data in a DB different than DB0 we return C_ERR to
 *    signal the caller it should quit the server with an error message
 *    or take other actions.
 *
 * The function always returns C_OK even if it will try to correct
 * the error described in "1". However if data is found in DB different
 * from DB0, C_ERR is returned.
 *
 * The function also uses the logging facility in order to warn the user
 * about desynchronizations between the data we have in memory and the
 * cluster configuration. */
int verifyClusterConfigWithData(void) {
    int j;
    int update_config = 0;  /* Non-zero when the cluster table was fixed and must be saved. */

    /* Return ASAP if a module disabled cluster redirections. In that case
     * every master can store keys about every possible hash slot. */
    if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION)
        return C_OK;

    /* If this node is a slave, don't perform the check at all as we
     * completely depend on the replication stream. */
    if (nodeIsSlave(myself)) return C_OK;

    /* Make sure we only have keys in DB0. */
    for (j = 1; j < server.dbnum; j++) {
        if (dbSize(&server.db[j], DB_MAIN)) return C_ERR;
    }

    /* Check that all the slots we see populated memory have a corresponding
     * entry in the cluster table. Otherwise fix the table. */
    for (j = 0; j < CLUSTER_SLOTS; j++) {
        if (!countKeysInSlot(j)) continue; /* No keys in this slot. */
        /* Check if we are assigned to this slot or if we are importing it.
         * In both cases check the next slot as the configuration makes
         * sense. */
        if (server.cluster->slots[j] == myself ||
            server.cluster->importing_slots_from[j] != NULL) continue;

        /* If we are here data and cluster config don't agree, and we have
         * slot 'j' populated even if we are not importing it, nor we are
         * assigned to this slot. Fix this condition. */

        update_config++;
        /* Case A: slot is unassigned. Take responsibility for it. */
        if (server.cluster->slots[j] == NULL) {
            serverLog(LL_NOTICE, "I have keys for unassigned slot %d. "
                                 "Taking responsibility for it.",j);
            clusterAddSlot(myself,j);
        } else {
            serverLog(LL_NOTICE, "I have keys for slot %d, but the slot is "
                                 "assigned to another node. "
                                 "Setting it to importing state.",j);
            server.cluster->importing_slots_from[j] = server.cluster->slots[j];
        }
    }
    if (update_config) clusterSaveConfigOrDie(1);
    return C_OK;
}

/* Remove all the shard channel related information not owned by the current shard. */
static inline void removeAllNotOwnedShardChannelSubscriptions(void) {
    if (!dictSize(server.pubsubshard_channels)) return;
    clusterNode *currmaster = clusterNodeIsMaster(myself) ? myself : myself->slaveof;
    for (int j = 0; j < CLUSTER_SLOTS; j++) {
        if (server.cluster->slots[j] != currmaster) {
            removeChannelsInSlot(j);
        }
    }
}

/* -----------------------------------------------------------------------------
 * SLAVE nodes handling
 * -------------------------------------------------------------------------- */

/* Set the specified node 'n' as master for this node.
 * If this node is currently a master, it is turned into a slave.
*/ +void clusterSetMaster(clusterNode *n) { + serverAssert(n != myself); + serverAssert(myself->numslots == 0); + + if (clusterNodeIsMaster(myself)) { + myself->flags &= ~(CLUSTER_NODE_MASTER|CLUSTER_NODE_MIGRATE_TO); + myself->flags |= CLUSTER_NODE_SLAVE; + clusterCloseAllSlots(); + } else { + if (myself->slaveof) + clusterNodeRemoveSlave(myself->slaveof,myself); + } + myself->slaveof = n; + updateShardId(myself, n->shard_id); + clusterNodeAddSlave(n,myself); + replicationSetMaster(n->ip, getNodeDefaultReplicationPort(n)); + removeAllNotOwnedShardChannelSubscriptions(); + resetManualFailover(); +} + +/* ----------------------------------------------------------------------------- + * Nodes to string representation functions. + * -------------------------------------------------------------------------- */ + +struct redisNodeFlags { + uint16_t flag; + char *name; +}; + +static struct redisNodeFlags redisNodeFlagsTable[] = { + {CLUSTER_NODE_MYSELF, "myself,"}, + {CLUSTER_NODE_MASTER, "master,"}, + {CLUSTER_NODE_SLAVE, "slave,"}, + {CLUSTER_NODE_PFAIL, "fail?,"}, + {CLUSTER_NODE_FAIL, "fail,"}, + {CLUSTER_NODE_HANDSHAKE, "handshake,"}, + {CLUSTER_NODE_NOADDR, "noaddr,"}, + {CLUSTER_NODE_NOFAILOVER, "nofailover,"} +}; + +/* Concatenate the comma separated list of node flags to the given SDS + * string 'ci'. */ +sds representClusterNodeFlags(sds ci, uint16_t flags) { + size_t orig_len = sdslen(ci); + int i, size = sizeof(redisNodeFlagsTable)/sizeof(struct redisNodeFlags); + for (i = 0; i < size; i++) { + struct redisNodeFlags *nodeflag = redisNodeFlagsTable + i; + if (flags & nodeflag->flag) ci = sdscat(ci, nodeflag->name); + } + /* If no flag was added, add the "noflags" special flag. */ + if (sdslen(ci) == orig_len) ci = sdscat(ci,"noflags,"); + sdsIncrLen(ci,-1); /* Remove trailing comma. */ + return ci; +} + +/* Concatenate the slot ownership information to the given SDS string 'ci'. 
+ * If the slot ownership is in a contiguous block, it's represented as start-end pair, + * else each slot is added separately. */ +sds representSlotInfo(sds ci, uint16_t *slot_info_pairs, int slot_info_pairs_count) { + for (int i = 0; i< slot_info_pairs_count; i+=2) { + unsigned long start = slot_info_pairs[i]; + unsigned long end = slot_info_pairs[i+1]; + if (start == end) { + ci = sdscatfmt(ci, " %i", start); + } else { + ci = sdscatfmt(ci, " %i-%i", start, end); + } + } + return ci; +} + +/* Generate a csv-alike representation of the specified cluster node. + * See clusterGenNodesDescription() top comment for more information. + * + * The function returns the string representation as an SDS string. */ +sds clusterGenNodeDescription(client *c, clusterNode *node, int tls_primary) { + int j, start; + sds ci; + int port = clusterNodeClientPort(node, tls_primary); + + /* Node coordinates */ + ci = sdscatlen(sdsempty(),node->name,CLUSTER_NAMELEN); + ci = sdscatfmt(ci," %s:%i@%i", + node->ip, + port, + node->cport); + if (sdslen(node->hostname) != 0) { + ci = sdscatfmt(ci,",%s", node->hostname); + } + /* Don't expose aux fields to any clients yet but do allow them + * to be persisted to nodes.conf */ + if (c == NULL) { + if (sdslen(node->hostname) == 0) { + ci = sdscatfmt(ci,",", 1); + } + for (int i = af_count-1; i >=0; i--) { + if ((tls_primary && i == af_tls_port) || (!tls_primary && i == af_tcp_port)) { + continue; + } + if (auxFieldHandlers[i].isPresent(node)) { + ci = sdscatprintf(ci, ",%s=", auxFieldHandlers[i].field); + ci = auxFieldHandlers[i].getter(node, ci); + } + } + } + + /* Flags */ + ci = sdscatlen(ci," ",1); + ci = representClusterNodeFlags(ci, node->flags); + + /* Slave of... 
or just "-" */ + ci = sdscatlen(ci," ",1); + if (node->slaveof) + ci = sdscatlen(ci,node->slaveof->name,CLUSTER_NAMELEN); + else + ci = sdscatlen(ci,"-",1); + + unsigned long long nodeEpoch = node->configEpoch; + if (nodeIsSlave(node) && node->slaveof) { + nodeEpoch = node->slaveof->configEpoch; + } + /* Latency from the POV of this node, config epoch, link status */ + ci = sdscatfmt(ci," %I %I %U %s", + (long long) node->ping_sent, + (long long) node->pong_received, + nodeEpoch, + (node->link || node->flags & CLUSTER_NODE_MYSELF) ? + "connected" : "disconnected"); + + /* Slots served by this instance. If we already have slots info, + * append it directly, otherwise, generate slots only if it has. */ + if (node->slot_info_pairs) { + ci = representSlotInfo(ci, node->slot_info_pairs, node->slot_info_pairs_count); + } else if (node->numslots > 0) { + start = -1; + for (j = 0; j < CLUSTER_SLOTS; j++) { + int bit; + + if ((bit = clusterNodeCoversSlot(node, j)) != 0) { + if (start == -1) start = j; + } + if (start != -1 && (!bit || j == CLUSTER_SLOTS-1)) { + if (bit && j == CLUSTER_SLOTS-1) j++; + + if (start == j-1) { + ci = sdscatfmt(ci," %i",start); + } else { + ci = sdscatfmt(ci," %i-%i",start,j-1); + } + start = -1; + } + } + } + + /* Just for MYSELF node we also dump info about slots that + * we are migrating to other instances or importing from other + * instances. */ + if (node->flags & CLUSTER_NODE_MYSELF) { + for (j = 0; j < CLUSTER_SLOTS; j++) { + if (server.cluster->migrating_slots_to[j]) { + ci = sdscatprintf(ci," [%d->-%.40s]",j, + server.cluster->migrating_slots_to[j]->name); + } else if (server.cluster->importing_slots_from[j]) { + ci = sdscatprintf(ci," [%d-<-%.40s]",j, + server.cluster->importing_slots_from[j]->name); + } + } + } + return ci; +} + +/* Generate the slot topology for all nodes and store the string representation + * in the slots_info struct on the node. 
This is used to improve the efficiency + * of clusterGenNodesDescription() because it removes looping of the slot space + * for generating the slot info for each node individually. */ +void clusterGenNodesSlotsInfo(int filter) { + clusterNode *n = NULL; + int start = -1; + + for (int i = 0; i <= CLUSTER_SLOTS; i++) { + /* Find start node and slot id. */ + if (n == NULL) { + if (i == CLUSTER_SLOTS) break; + n = server.cluster->slots[i]; + start = i; + continue; + } + + /* Generate slots info when occur different node with start + * or end of slot. */ + if (i == CLUSTER_SLOTS || n != server.cluster->slots[i]) { + if (!(n->flags & filter)) { + if (!n->slot_info_pairs) { + n->slot_info_pairs = zmalloc(2 * n->numslots * sizeof(uint16_t)); + } + serverAssert((n->slot_info_pairs_count + 1) < (2 * n->numslots)); + n->slot_info_pairs[n->slot_info_pairs_count++] = start; + n->slot_info_pairs[n->slot_info_pairs_count++] = i-1; + } + if (i == CLUSTER_SLOTS) break; + n = server.cluster->slots[i]; + start = i; + } + } +} + +void clusterFreeNodesSlotsInfo(clusterNode *n) { + zfree(n->slot_info_pairs); + n->slot_info_pairs = NULL; + n->slot_info_pairs_count = 0; +} + +/* Generate a csv-alike representation of the nodes we are aware of, + * including the "myself" node, and return an SDS string containing the + * representation (it is up to the caller to free it). + * + * All the nodes matching at least one of the node flags specified in + * "filter" are excluded from the output, so using zero as a filter will + * include all the known nodes in the representation, including nodes in + * the HANDSHAKE state. + * + * Setting tls_primary to 1 to put TLS port in the main : + * field and put TCP port in aux field, instead of the opposite way. + * + * The representation obtained using this function is used for the output + * of the CLUSTER NODES function, and as format for the cluster + * configuration file (nodes.conf) for a given node. 
*/ +sds clusterGenNodesDescription(client *c, int filter, int tls_primary) { + sds ci = sdsempty(), ni; + dictIterator *di; + dictEntry *de; + + /* Generate all nodes slots info firstly. */ + clusterGenNodesSlotsInfo(filter); + + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + if (node->flags & filter) continue; + ni = clusterGenNodeDescription(c, node, tls_primary); + ci = sdscatsds(ci,ni); + sdsfree(ni); + ci = sdscatlen(ci,"\n",1); + + /* Release slots info. */ + clusterFreeNodesSlotsInfo(node); + } + dictReleaseIterator(di); + return ci; +} + +/* Add to the output buffer of the given client the description of the given cluster link. + * The description is a map with each entry being an attribute of the link. */ +void addReplyClusterLinkDescription(client *c, clusterLink *link) { + addReplyMapLen(c, 6); + + addReplyBulkCString(c, "direction"); + addReplyBulkCString(c, link->inbound ? "from" : "to"); + + /* addReplyClusterLinkDescription is only called for links that have been + * associated with nodes. The association is always bi-directional, so + * in addReplyClusterLinkDescription, link->node should never be NULL. 
*/ + serverAssert(link->node); + sds node_name = sdsnewlen(link->node->name, CLUSTER_NAMELEN); + addReplyBulkCString(c, "node"); + addReplyBulkCString(c, node_name); + sdsfree(node_name); + + addReplyBulkCString(c, "create-time"); + addReplyLongLong(c, link->ctime); + + char events[3], *p; + p = events; + if (link->conn) { + if (connHasReadHandler(link->conn)) *p++ = 'r'; + if (connHasWriteHandler(link->conn)) *p++ = 'w'; + } + *p = '\0'; + addReplyBulkCString(c, "events"); + addReplyBulkCString(c, events); + + addReplyBulkCString(c, "send-buffer-allocated"); + addReplyLongLong(c, link->send_msg_queue_mem); + + addReplyBulkCString(c, "send-buffer-used"); + addReplyLongLong(c, link->send_msg_queue_mem); +} + +/* Add to the output buffer of the given client an array of cluster link descriptions, + * with array entry being a description of a single current cluster link. */ +void addReplyClusterLinksDescription(client *c) { + dictIterator *di; + dictEntry *de; + void *arraylen_ptr = NULL; + int num_links = 0; + + arraylen_ptr = addReplyDeferredLen(c); + + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + if (node->link) { + num_links++; + addReplyClusterLinkDescription(c, node->link); + } + if (node->inbound_link) { + num_links++; + addReplyClusterLinkDescription(c, node->inbound_link); + } + } + dictReleaseIterator(di); + + setDeferredArrayLen(c, arraylen_ptr, num_links); +} + +/* ----------------------------------------------------------------------------- + * CLUSTER command + * -------------------------------------------------------------------------- */ + +const char *clusterGetMessageTypeString(int type) { + switch(type) { + case CLUSTERMSG_TYPE_PING: return "ping"; + case CLUSTERMSG_TYPE_PONG: return "pong"; + case CLUSTERMSG_TYPE_MEET: return "meet"; + case CLUSTERMSG_TYPE_FAIL: return "fail"; + case CLUSTERMSG_TYPE_PUBLISH: return "publish"; + case CLUSTERMSG_TYPE_PUBLISHSHARD: 
return "publishshard"; + case CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST: return "auth-req"; + case CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK: return "auth-ack"; + case CLUSTERMSG_TYPE_UPDATE: return "update"; + case CLUSTERMSG_TYPE_MFSTART: return "mfstart"; + case CLUSTERMSG_TYPE_MODULE: return "module"; + } + return "unknown"; +} + +int getSlotOrReply(client *c, robj *o) { + long long slot; + + if (getLongLongFromObject(o,&slot) != C_OK || + slot < 0 || slot >= CLUSTER_SLOTS) + { + addReplyError(c,"Invalid or out of range slot"); + return -1; + } + return (int) slot; +} + +int checkSlotAssignmentsOrReply(client *c, unsigned char *slots, int del, int start_slot, int end_slot) { + int slot; + for (slot = start_slot; slot <= end_slot; slot++) { + if (del && server.cluster->slots[slot] == NULL) { + addReplyErrorFormat(c,"Slot %d is already unassigned", slot); + return C_ERR; + } else if (!del && server.cluster->slots[slot]) { + addReplyErrorFormat(c,"Slot %d is already busy", slot); + return C_ERR; + } + if (slots[slot]++ == 1) { + addReplyErrorFormat(c,"Slot %d specified multiple times",(int)slot); + return C_ERR; + } + } + return C_OK; +} + +void clusterUpdateSlots(client *c, unsigned char *slots, int del) { + int j; + for (j = 0; j < CLUSTER_SLOTS; j++) { + if (slots[j]) { + int retval; + + /* If this slot was set as importing we can clear this + * state as now we are the real owner of the slot. */ + if (server.cluster->importing_slots_from[j]) + server.cluster->importing_slots_from[j] = NULL; + + retval = del ? clusterDelSlot(j) : + clusterAddSlot(myself,j); + serverAssertWithInfo(c,NULL,retval == C_OK); + } + } +} + +/* Add detailed information of a node to the output buffer of the given client. 
*/ +void addNodeDetailsToShardReply(client *c, clusterNode *node) { + int reply_count = 0; + void *node_replylen = addReplyDeferredLen(c); + addReplyBulkCString(c, "id"); + addReplyBulkCBuffer(c, node->name, CLUSTER_NAMELEN); + reply_count++; + + if (node->tcp_port) { + addReplyBulkCString(c, "port"); + addReplyLongLong(c, node->tcp_port); + reply_count++; + } + + if (node->tls_port) { + addReplyBulkCString(c, "tls-port"); + addReplyLongLong(c, node->tls_port); + reply_count++; + } + + addReplyBulkCString(c, "ip"); + addReplyBulkCString(c, node->ip); + reply_count++; + + addReplyBulkCString(c, "endpoint"); + addReplyBulkCString(c, clusterNodePreferredEndpoint(node)); + reply_count++; + + if (sdslen(node->hostname) != 0) { + addReplyBulkCString(c, "hostname"); + addReplyBulkCBuffer(c, node->hostname, sdslen(node->hostname)); + reply_count++; + } + + long long node_offset; + if (node->flags & CLUSTER_NODE_MYSELF) { + node_offset = nodeIsSlave(node) ? replicationGetSlaveOffset() : server.master_repl_offset; + } else { + node_offset = node->repl_offset; + } + + addReplyBulkCString(c, "role"); + addReplyBulkCString(c, nodeIsSlave(node) ? "replica" : "master"); + reply_count++; + + addReplyBulkCString(c, "replication-offset"); + addReplyLongLong(c, node_offset); + reply_count++; + + addReplyBulkCString(c, "health"); + const char *health_msg = NULL; + if (nodeFailed(node)) { + health_msg = "fail"; + } else if (nodeIsSlave(node) && node_offset == 0) { + health_msg = "loading"; + } else { + health_msg = "online"; + } + addReplyBulkCString(c, health_msg); + reply_count++; + + setDeferredMapLen(c, node_replylen, reply_count); +} + +/* Add the shard reply of a single shard based off the given primary node. 
*/ +void addShardReplyForClusterShards(client *c, list *nodes) { + serverAssert(listLength(nodes) > 0); + clusterNode *n = listNodeValue(listFirst(nodes)); + addReplyMapLen(c, 2); + addReplyBulkCString(c, "slots"); + + /* Use slot_info_pairs from the primary only */ + while (n->slaveof != NULL) n = n->slaveof; + + if (n->slot_info_pairs != NULL) { + serverAssert((n->slot_info_pairs_count % 2) == 0); + addReplyArrayLen(c, n->slot_info_pairs_count); + for (int i = 0; i < n->slot_info_pairs_count; i++) + addReplyLongLong(c, (unsigned long)n->slot_info_pairs[i]); + } else { + /* If no slot info pair is provided, the node owns no slots */ + addReplyArrayLen(c, 0); + } + + addReplyBulkCString(c, "nodes"); + addReplyArrayLen(c, listLength(nodes)); + listIter li; + listRewind(nodes, &li); + for (listNode *ln = listNext(&li); ln != NULL; ln = listNext(&li)) { + clusterNode *n = listNodeValue(ln); + addNodeDetailsToShardReply(c, n); + clusterFreeNodesSlotsInfo(n); + } +} + +/* Add to the output buffer of the given client, an array of slot (start, end) + * pair owned by the shard, also the primary and set of replica(s) along with + * information about each node. 
*/ +void clusterCommandShards(client *c) { + addReplyArrayLen(c, dictSize(server.cluster->shards)); + /* This call will add slot_info_pairs to all nodes */ + clusterGenNodesSlotsInfo(0); + dictIterator *di = dictGetSafeIterator(server.cluster->shards); + for(dictEntry *de = dictNext(di); de != NULL; de = dictNext(di)) { + addShardReplyForClusterShards(c, dictGetVal(de)); + } + dictReleaseIterator(di); +} + +sds genClusterInfoString(void) { + sds info = sdsempty(); + char *statestr[] = {"ok","fail"}; + int slots_assigned = 0, slots_ok = 0, slots_pfail = 0, slots_fail = 0; + uint64_t myepoch; + int j; + + for (j = 0; j < CLUSTER_SLOTS; j++) { + clusterNode *n = server.cluster->slots[j]; + + if (n == NULL) continue; + slots_assigned++; + if (nodeFailed(n)) { + slots_fail++; + } else if (nodeTimedOut(n)) { + slots_pfail++; + } else { + slots_ok++; + } + } + + myepoch = (nodeIsSlave(myself) && myself->slaveof) ? + myself->slaveof->configEpoch : myself->configEpoch; + + info = sdscatprintf(info, + "cluster_state:%s\r\n" + "cluster_slots_assigned:%d\r\n" + "cluster_slots_ok:%d\r\n" + "cluster_slots_pfail:%d\r\n" + "cluster_slots_fail:%d\r\n" + "cluster_known_nodes:%lu\r\n" + "cluster_size:%d\r\n" + "cluster_current_epoch:%llu\r\n" + "cluster_my_epoch:%llu\r\n" + , statestr[server.cluster->state], + slots_assigned, + slots_ok, + slots_pfail, + slots_fail, + dictSize(server.cluster->nodes), + server.cluster->size, + (unsigned long long) server.cluster->currentEpoch, + (unsigned long long) myepoch + ); + + /* Show stats about messages sent and received. 
*/ + long long tot_msg_sent = 0; + long long tot_msg_received = 0; + + for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) { + if (server.cluster->stats_bus_messages_sent[i] == 0) continue; + tot_msg_sent += server.cluster->stats_bus_messages_sent[i]; + info = sdscatprintf(info, + "cluster_stats_messages_%s_sent:%lld\r\n", + clusterGetMessageTypeString(i), + server.cluster->stats_bus_messages_sent[i]); + } + info = sdscatprintf(info, + "cluster_stats_messages_sent:%lld\r\n", tot_msg_sent); + + for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) { + if (server.cluster->stats_bus_messages_received[i] == 0) continue; + tot_msg_received += server.cluster->stats_bus_messages_received[i]; + info = sdscatprintf(info, + "cluster_stats_messages_%s_received:%lld\r\n", + clusterGetMessageTypeString(i), + server.cluster->stats_bus_messages_received[i]); + } + info = sdscatprintf(info, + "cluster_stats_messages_received:%lld\r\n", tot_msg_received); + + info = sdscatprintf(info, + "total_cluster_links_buffer_limit_exceeded:%llu\r\n", + server.cluster->stat_cluster_links_buffer_limit_exceeded); + + return info; +} + + +void removeChannelsInSlot(unsigned int slot) { + unsigned int channelcount = countChannelsInSlot(slot); + if (channelcount == 0) return; + + /* Retrieve all the channels for the slot. */ + robj **channels = zmalloc(sizeof(robj*)*channelcount); + raxIterator iter; + int j = 0; + unsigned char indexed[2]; + + indexed[0] = (slot >> 8) & 0xff; + indexed[1] = slot & 0xff; + raxStart(&iter,server.cluster->slots_to_channels); + raxSeek(&iter,">=",indexed,2); + while(raxNext(&iter)) { + if (iter.key[0] != indexed[0] || iter.key[1] != indexed[1]) break; + channels[j++] = createStringObject((char*)iter.key + 2, iter.key_len - 2); + } + raxStop(&iter); + + pubsubUnsubscribeShardChannels(channels, channelcount); + zfree(channels); +} + + + +/* Remove all the keys in the specified hash slot. + * The number of removed items is returned. 
*/ +unsigned int delKeysInSlot(unsigned int hashslot) { + unsigned int j = 0; + + dictIterator *iter = NULL; + dictEntry *de = NULL; + iter = dictGetSafeIterator(server.db->dict[hashslot]); + while((de = dictNext(iter)) != NULL) { + sds sdskey = dictGetKey(de); + robj *key = createStringObject(sdskey, sdslen(sdskey)); + dbDelete(&server.db[0], key); + propagateDeletion(&server.db[0], key, server.lazyfree_lazy_server_del); + signalModifiedKey(NULL, &server.db[0], key); + /* The keys are not actually logically deleted from the database, just moved to another node. + * The modules needs to know that these keys are no longer available locally, so just send the + * keyspace notification to the modules, but not to clients. */ + moduleNotifyKeyspaceEvent(NOTIFY_GENERIC, "del", key, server.db[0].id); + postExecutionUnitOperations(); + decrRefCount(key); + j++; + server.dirty++; + } + dictReleaseIterator(iter); + + return j; +} + +/* ----------------------------------------------------------------------------- + * Operation(s) on channel rax tree. + * -------------------------------------------------------------------------- */ + +void slotToChannelUpdate(sds channel, int add) { + size_t keylen = sdslen(channel); + unsigned int hashslot = keyHashSlot(channel,keylen); + unsigned char buf[64]; + unsigned char *indexed = buf; + + if (keylen+2 > 64) indexed = zmalloc(keylen+2); + indexed[0] = (hashslot >> 8) & 0xff; + indexed[1] = hashslot & 0xff; + memcpy(indexed+2,channel,keylen); + if (add) { + raxInsert(server.cluster->slots_to_channels,indexed,keylen+2,NULL,NULL); + } else { + raxRemove(server.cluster->slots_to_channels,indexed,keylen+2,NULL); + } + if (indexed != buf) zfree(indexed); +} + +void slotToChannelAdd(sds channel) { + slotToChannelUpdate(channel,1); +} + +void slotToChannelDel(sds channel) { + slotToChannelUpdate(channel,0); +} + +/* Get the count of the channels for a given slot. 
*/ +unsigned int countChannelsInSlot(unsigned int hashslot) { + raxIterator iter; + int j = 0; + unsigned char indexed[2]; + + indexed[0] = (hashslot >> 8) & 0xff; + indexed[1] = hashslot & 0xff; + raxStart(&iter,server.cluster->slots_to_channels); + raxSeek(&iter,">=",indexed,2); + while(raxNext(&iter)) { + if (iter.key[0] != indexed[0] || iter.key[1] != indexed[1]) break; + j++; + } + raxStop(&iter); + return j; +} + +int clusterNodeIsMyself(clusterNode *n) { + return n == server.cluster->myself; +} + +clusterNode *getMyClusterNode(void) { + return server.cluster->myself; +} + +int clusterManualFailoverTimeLimit(void) { + return server.cluster->mf_end; +} + +int getClusterSize(void) { + return dictSize(server.cluster->nodes); +} + +char **getClusterNodesList(size_t *numnodes) { + size_t count = dictSize(server.cluster->nodes); + char **ids = zmalloc((count+1)*CLUSTER_NAMELEN); + dictIterator *di = dictGetIterator(server.cluster->nodes); + dictEntry *de; + int j = 0; + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + if (node->flags & (CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE)) continue; + ids[j] = zmalloc(CLUSTER_NAMELEN); + memcpy(ids[j],node->name,CLUSTER_NAMELEN); + j++; + } + *numnodes = j; + ids[j] = NULL; /* Null term so that FreeClusterNodesList does not need + * to also get the count argument. */ + dictReleaseIterator(di); + return ids; +} + +int clusterNodeIsMaster(clusterNode *n) { + return n->flags & CLUSTER_NODE_MASTER; +} + +int handleDebugClusterCommand(client *c) { + if (strcasecmp(c->argv[1]->ptr, "CLUSTERLINK") || + strcasecmp(c->argv[2]->ptr, "KILL") || + c->argc != 5) { + return 0; + } + + if (!server.cluster_enabled) { + addReplyError(c, "Debug option only available for cluster mode enabled setup!"); + return 1; + } + + /* Find the node. 
*/ + clusterNode *n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); + if (!n) { + addReplyErrorFormat(c, "Unknown node %s", (char *) c->argv[4]->ptr); + return 1; + } + + /* Terminate the link based on the direction or all. */ + if (!strcasecmp(c->argv[3]->ptr, "from")) { + freeClusterLink(n->inbound_link); + } else if (!strcasecmp(c->argv[3]->ptr, "to")) { + freeClusterLink(n->link); + } else if (!strcasecmp(c->argv[3]->ptr, "all")) { + freeClusterLink(n->link); + freeClusterLink(n->inbound_link); + } else { + addReplyErrorFormat(c, "Unknown direction %s", (char *) c->argv[3]->ptr); + } + addReply(c, shared.ok); + + return 1; +} + +int clusterNodePending(clusterNode *node) { + return node->flags & (CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE); +} + +char *clusterNodeIp(clusterNode *node) { + return node->ip; +} + +int clusterNodeIsSlave(clusterNode *node) { + return node->flags & CLUSTER_NODE_SLAVE; +} + +clusterNode *clusterNodeGetSlaveof(clusterNode *node) { + return node->slaveof; +} + +char *clusterNodeGetName(clusterNode *node) { + return node->name; +} + +int clusterNodeTimedOut(clusterNode *node) { + return nodeTimedOut(node); +} + +int clusterNodeIsFailing(clusterNode *node) { + return nodeFailed(node); +} + +int clusterNodeIsNoFailover(clusterNode *node) { + return node->flags & CLUSTER_NODE_NOFAILOVER; +} + +const char **clusterDebugCommandExtendedHelp(void) { + static const char *help[] = { + "CLUSTERLINK KILL ", + " Kills the link based on the direction to/from (both) with the provided node.", + NULL + }; + + return help; +} + +char *clusterNodeGetShardId(clusterNode *node) { + return node->shard_id; +} + +int clusterCommandSpecial(client *c) { + if (!strcasecmp(c->argv[1]->ptr,"meet") && (c->argc == 4 || c->argc == 5)) { + /* CLUSTER MEET [cport] */ + long long port, cport; + + if (getLongLongFromObject(c->argv[3], &port) != C_OK) { + addReplyErrorFormat(c,"Invalid base port specified: %s", + (char*)c->argv[3]->ptr); + return 1; + } + + 
if (c->argc == 5) { + if (getLongLongFromObject(c->argv[4], &cport) != C_OK) { + addReplyErrorFormat(c,"Invalid bus port specified: %s", + (char*)c->argv[4]->ptr); + return 1; + } + } else { + cport = port + CLUSTER_PORT_INCR; + } + + if (clusterStartHandshake(c->argv[2]->ptr,port,cport) == 0 && + errno == EINVAL) + { + addReplyErrorFormat(c,"Invalid node address specified: %s:%s", + (char*)c->argv[2]->ptr, (char*)c->argv[3]->ptr); + } else { + addReply(c,shared.ok); + } + } else if (!strcasecmp(c->argv[1]->ptr,"flushslots") && c->argc == 2) { + /* CLUSTER FLUSHSLOTS */ + if (dbSize(&server.db[0], DB_MAIN) != 0) { + addReplyError(c,"DB must be empty to perform CLUSTER FLUSHSLOTS."); + return 1; + } + clusterDelNodeSlots(myself); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } else if ((!strcasecmp(c->argv[1]->ptr,"addslots") || + !strcasecmp(c->argv[1]->ptr,"delslots")) && c->argc >= 3) { + /* CLUSTER ADDSLOTS [slot] ... */ + /* CLUSTER DELSLOTS [slot] ... */ + int j, slot; + unsigned char *slots = zmalloc(CLUSTER_SLOTS); + int del = !strcasecmp(c->argv[1]->ptr,"delslots"); + + memset(slots,0,CLUSTER_SLOTS); + /* Check that all the arguments are parseable.*/ + for (j = 2; j < c->argc; j++) { + if ((slot = getSlotOrReply(c,c->argv[j])) == C_ERR) { + zfree(slots); + return 1; + } + } + /* Check that the slots are not already busy. */ + for (j = 2; j < c->argc; j++) { + slot = getSlotOrReply(c,c->argv[j]); + if (checkSlotAssignmentsOrReply(c, slots, del, slot, slot) == C_ERR) { + zfree(slots); + return 1; + } + } + clusterUpdateSlots(c, slots, del); + zfree(slots); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } else if ((!strcasecmp(c->argv[1]->ptr,"addslotsrange") || + !strcasecmp(c->argv[1]->ptr,"delslotsrange")) && c->argc >= 4) { + if (c->argc % 2 == 1) { + addReplyErrorArity(c); + return 1; + } + /* CLUSTER ADDSLOTSRANGE [ ...] 
*/ + /* CLUSTER DELSLOTSRANGE [ ...] */ + int j, startslot, endslot; + unsigned char *slots = zmalloc(CLUSTER_SLOTS); + int del = !strcasecmp(c->argv[1]->ptr,"delslotsrange"); + + memset(slots,0,CLUSTER_SLOTS); + /* Check that all the arguments are parseable and that all the + * slots are not already busy. */ + for (j = 2; j < c->argc; j += 2) { + if ((startslot = getSlotOrReply(c,c->argv[j])) == C_ERR) { + zfree(slots); + return 1; + } + if ((endslot = getSlotOrReply(c,c->argv[j+1])) == C_ERR) { + zfree(slots); + return 1; + } + if (startslot > endslot) { + addReplyErrorFormat(c,"start slot number %d is greater than end slot number %d", startslot, endslot); + zfree(slots); + return 1; + } + + if (checkSlotAssignmentsOrReply(c, slots, del, startslot, endslot) == C_ERR) { + zfree(slots); + return 1; + } + } + clusterUpdateSlots(c, slots, del); + zfree(slots); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"setslot") && c->argc >= 4) { + /* SETSLOT 10 MIGRATING */ + /* SETSLOT 10 IMPORTING */ + /* SETSLOT 10 STABLE */ + /* SETSLOT 10 NODE */ + int slot; + clusterNode *n; + + if (nodeIsSlave(myself)) { + addReplyError(c,"Please use SETSLOT only with masters."); + return 1; + } + + if ((slot = getSlotOrReply(c, c->argv[2])) == -1) return 1; + + if (!strcasecmp(c->argv[3]->ptr,"migrating") && c->argc == 5) { + if (server.cluster->slots[slot] != myself) { + addReplyErrorFormat(c,"I'm not the owner of hash slot %u",slot); + return 1; + } + n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); + if (n == NULL) { + addReplyErrorFormat(c,"I don't know about node %s", + (char*)c->argv[4]->ptr); + return 1; + } + if (nodeIsSlave(n)) { + addReplyError(c,"Target node is not a master"); + return 1; + } + server.cluster->migrating_slots_to[slot] = n; + } else if (!strcasecmp(c->argv[3]->ptr,"importing") && c->argc == 5) { + if (server.cluster->slots[slot] == myself) { + 
addReplyErrorFormat(c, + "I'm already the owner of hash slot %u",slot); + return 1; + } + n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); + if (n == NULL) { + addReplyErrorFormat(c,"I don't know about node %s", + (char*)c->argv[4]->ptr); + return 1; + } + if (nodeIsSlave(n)) { + addReplyError(c,"Target node is not a master"); + return 1; + } + server.cluster->importing_slots_from[slot] = n; + } else if (!strcasecmp(c->argv[3]->ptr,"stable") && c->argc == 4) { + /* CLUSTER SETSLOT STABLE */ + server.cluster->importing_slots_from[slot] = NULL; + server.cluster->migrating_slots_to[slot] = NULL; + } else if (!strcasecmp(c->argv[3]->ptr,"node") && c->argc == 5) { + /* CLUSTER SETSLOT NODE */ + n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); + if (!n) { + addReplyErrorFormat(c,"Unknown node %s", + (char*)c->argv[4]->ptr); + return 1; + } + if (nodeIsSlave(n)) { + addReplyError(c,"Target node is not a master"); + return 1; + } + /* If this hash slot was served by 'myself' before to switch + * make sure there are no longer local keys for this hash slot. */ + if (server.cluster->slots[slot] == myself && n != myself) { + if (countKeysInSlot(slot) != 0) { + addReplyErrorFormat(c, + "Can't assign hashslot %d to a different node " + "while I still hold keys for this hash slot.", slot); + return 1; + } + } + /* If this slot is in migrating status but we have no keys + * for it assigning the slot to another node will clear + * the migrating status. */ + if (countKeysInSlot(slot) == 0 && + server.cluster->migrating_slots_to[slot]) + server.cluster->migrating_slots_to[slot] = NULL; + + int slot_was_mine = server.cluster->slots[slot] == myself; + clusterDelSlot(slot); + clusterAddSlot(n,slot); + + /* If we are a master left without slots, we should turn into a + * replica of the new master. 
*/ + if (slot_was_mine && + n != myself && + myself->numslots == 0 && + server.cluster_allow_replica_migration) { + serverLog(LL_NOTICE, + "Configuration change detected. Reconfiguring myself " + "as a replica of %.40s (%s)", n->name, n->human_nodename); + clusterSetMaster(n); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | + CLUSTER_TODO_UPDATE_STATE | + CLUSTER_TODO_FSYNC_CONFIG); + } + + /* If this node was importing this slot, assigning the slot to + * itself also clears the importing status. */ + if (n == myself && + server.cluster->importing_slots_from[slot]) { + /* This slot was manually migrated, set this node configEpoch + * to a new epoch so that the new version can be propagated + * by the cluster. + * + * Note that if this ever results in a collision with another + * node getting the same configEpoch, for example because a + * failover happens at the same time we close the slot, the + * configEpoch collision resolution will fix it assigning + * a different epoch to each node. */ + if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { + serverLog(LL_NOTICE, + "configEpoch updated after importing slot %d", slot); + } + server.cluster->importing_slots_from[slot] = NULL; + /* After importing this slot, let the other nodes know as + * soon as possible. */ + clusterBroadcastPong(CLUSTER_BROADCAST_ALL); + } + } else { + addReplyError(c, + "Invalid CLUSTER SETSLOT action or number of arguments. Try CLUSTER HELP"); + return 1; + } + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_UPDATE_STATE); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"bumpepoch") && c->argc == 2) { + /* CLUSTER BUMPEPOCH */ + int retval = clusterBumpConfigEpochWithoutConsensus(); + sds reply = sdscatprintf(sdsempty(),"+%s %llu\r\n", + (retval == C_OK) ? 
"BUMPED" : "STILL", + (unsigned long long) myself->configEpoch); + addReplySds(c,reply); + } else if (!strcasecmp(c->argv[1]->ptr,"saveconfig") && c->argc == 2) { + int retval = clusterSaveConfig(1); + + if (retval == 0) + addReply(c,shared.ok); + else + addReplyErrorFormat(c,"error saving the cluster node config: %s", + strerror(errno)); + } else if (!strcasecmp(c->argv[1]->ptr,"forget") && c->argc == 3) { + /* CLUSTER FORGET */ + clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); + if (!n) { + if (clusterBlacklistExists((char*)c->argv[2]->ptr)) + /* Already forgotten. The deletion may have been gossipped by + * another node, so we pretend it succeeded. */ + addReply(c,shared.ok); + else + addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); + return 1; + } else if (n == myself) { + addReplyError(c,"I tried hard but I can't forget myself..."); + return 1; + } else if (nodeIsSlave(myself) && myself->slaveof == n) { + addReplyError(c,"Can't forget my master!"); + return 1; + } + clusterBlacklistAddNode(n); + clusterDelNode(n); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"replicate") && c->argc == 3) { + /* CLUSTER REPLICATE */ + /* Lookup the specified node in our table. */ + clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); + if (!n) { + addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); + return 1; + } + + /* I can't replicate myself. */ + if (n == myself) { + addReplyError(c,"Can't replicate myself"); + return 1; + } + + /* Can't replicate a slave. */ + if (nodeIsSlave(n)) { + addReplyError(c,"I can only replicate a master, not a replica."); + return 1; + } + + /* If the instance is currently a master, it should have no assigned + * slots nor keys to accept to replicate some other node. + * Slaves can switch to another master without issues. 
*/ + if (clusterNodeIsMaster(myself) && + (myself->numslots != 0 || dbSize(&server.db[0], DB_MAIN) != 0)) { + addReplyError(c, + "To set a master the node must be empty and " + "without assigned slots."); + return 1; + } + + /* Set the master. */ + clusterSetMaster(n); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"count-failure-reports") && + c->argc == 3) + { + /* CLUSTER COUNT-FAILURE-REPORTS */ + clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); + + if (!n) { + addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); + return 1; + } else { + addReplyLongLong(c,clusterNodeFailureReportsCount(n)); + } + } else if (!strcasecmp(c->argv[1]->ptr,"failover") && + (c->argc == 2 || c->argc == 3)) + { + /* CLUSTER FAILOVER [FORCE|TAKEOVER] */ + int force = 0, takeover = 0; + + if (c->argc == 3) { + if (!strcasecmp(c->argv[2]->ptr,"force")) { + force = 1; + } else if (!strcasecmp(c->argv[2]->ptr,"takeover")) { + takeover = 1; + force = 1; /* Takeover also implies force. */ + } else { + addReplyErrorObject(c,shared.syntaxerr); + return 1; + } + } + + /* Check preconditions. */ + if (clusterNodeIsMaster(myself)) { + addReplyError(c,"You should send CLUSTER FAILOVER to a replica"); + return 1; + } else if (myself->slaveof == NULL) { + addReplyError(c,"I'm a replica but my master is unknown to me"); + return 1; + } else if (!force && + (nodeFailed(myself->slaveof) || + myself->slaveof->link == NULL)) + { + addReplyError(c,"Master is down or failed, " + "please use CLUSTER FAILOVER FORCE"); + return 1; + } + resetManualFailover(); + server.cluster->mf_end = mstime() + CLUSTER_MF_TIMEOUT; + + if (takeover) { + /* A takeover does not perform any initial check. It just + * generates a new configuration epoch for this node without + * consensus, claims the master's slots, and broadcast the new + * configuration. 
*/ + serverLog(LL_NOTICE,"Taking over the master (user request)."); + clusterBumpConfigEpochWithoutConsensus(); + clusterFailoverReplaceYourMaster(); + } else if (force) { + /* If this is a forced failover, we don't need to talk with our + * master to agree about the offset. We just failover taking over + * it without coordination. */ + serverLog(LL_NOTICE,"Forced failover user request accepted."); + server.cluster->mf_can_start = 1; + } else { + serverLog(LL_NOTICE,"Manual failover user request accepted."); + clusterSendMFStart(myself->slaveof); + } + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"set-config-epoch") && c->argc == 3) + { + /* CLUSTER SET-CONFIG-EPOCH + * + * The user is allowed to set the config epoch only when a node is + * totally fresh: no config epoch, no other known node, and so forth. + * This happens at cluster creation time to start with a cluster where + * every node has a different node ID, without to rely on the conflicts + * resolution system which is too slow when a big cluster is created. */ + long long epoch; + + if (getLongLongFromObjectOrReply(c,c->argv[2],&epoch,NULL) != C_OK) + return 1; + + if (epoch < 0) { + addReplyErrorFormat(c,"Invalid config epoch specified: %lld",epoch); + } else if (dictSize(server.cluster->nodes) > 1) { + addReplyError(c,"The user can assign a config epoch only when the " + "node does not know any other node."); + } else if (myself->configEpoch != 0) { + addReplyError(c,"Node config epoch is already non-zero"); + } else { + myself->configEpoch = epoch; + serverLog(LL_NOTICE, + "configEpoch set to %llu via CLUSTER SET-CONFIG-EPOCH", + (unsigned long long) myself->configEpoch); + + if (server.cluster->currentEpoch < (uint64_t)epoch) + server.cluster->currentEpoch = epoch; + /* No need to fsync the config here since in the unlucky event + * of a failure to persist the config, the conflict resolution code + * will assign a unique config to this node. 
*/ + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } + } else if (!strcasecmp(c->argv[1]->ptr,"reset") && + (c->argc == 2 || c->argc == 3)) + { + /* CLUSTER RESET [SOFT|HARD] */ + int hard = 0; + + /* Parse soft/hard argument. Default is soft. */ + if (c->argc == 3) { + if (!strcasecmp(c->argv[2]->ptr,"hard")) { + hard = 1; + } else if (!strcasecmp(c->argv[2]->ptr,"soft")) { + hard = 0; + } else { + addReplyErrorObject(c,shared.syntaxerr); + return 1; + } + } + + /* Slaves can be reset while containing data, but not master nodes + * that must be empty. */ + if (clusterNodeIsMaster(myself) && dbSize(c->db, DB_MAIN) != 0) { + addReplyError(c,"CLUSTER RESET can't be called with " + "master nodes containing keys"); + return 1; + } + clusterReset(hard); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"links") && c->argc == 2) { + /* CLUSTER LINKS */ + addReplyClusterLinksDescription(c); + } else { + return 0; + } + + return 1; +} + +const char **clusterCommandExtendedHelp(void) { + static const char *help[] = { + "ADDSLOTS [ ...]", + " Assign slots to current node.", + "ADDSLOTSRANGE [ ...]", + " Assign slots which are between and to current node.", + "BUMPEPOCH", + " Advance the cluster config epoch.", + "COUNT-FAILURE-REPORTS ", + " Return number of failure reports for .", + "DELSLOTS [ ...]", + " Delete slots information from current node.", + "DELSLOTSRANGE [ ...]", + " Delete slots information which are between and from current node.", + "FAILOVER [FORCE|TAKEOVER]", + " Promote current replica node to being a master.", + "FORGET ", + " Remove a node from the cluster.", + "FLUSHSLOTS", + " Delete current node own slots information.", + "MEET []", + " Connect nodes into a working cluster.", + "REPLICATE ", + " Configure current node as replica to .", + "RESET [HARD|SOFT]", + " Reset current node (default: soft).", + "SET-CONFIG-EPOCH ", + " Set config epoch of current node.", + "SETSLOT 
(IMPORTING |MIGRATING |STABLE|NODE )", + " Set slot state.", + "SAVECONFIG", + " Force saving cluster configuration on disk.", + "LINKS", + " Return information about all network links between this node and its peers.", + " Output format is an array where each array element is a map containing attributes of a link", + NULL + }; + + return help; +} + +int clusterNodeNumSlaves(clusterNode *node) { + return node->numslaves; +} + +clusterNode *clusterNodeGetSlave(clusterNode *node, int slave_idx) { + return node->slaves[slave_idx]; +} + +clusterNode *getMigratingSlotDest(int slot) { + return server.cluster->migrating_slots_to[slot]; +} + +clusterNode *getImportingSlotSource(int slot) { + return server.cluster->importing_slots_from[slot]; +} + +int isClusterHealthy(void) { + return server.cluster->state == CLUSTER_OK; +} + +clusterNode *getNodeBySlot(int slot) { + return server.cluster->slots[slot]; +} + +char *clusterNodeHostname(clusterNode *node) { + return node->hostname; +} + +long long clusterNodeReplOffset(clusterNode *node) { + return node->repl_offset; +} + +const char *clusterNodePreferredEndpoint(clusterNode *n) { + char *hostname = clusterNodeHostname(n); + switch (server.cluster_preferred_endpoint_type) { + case CLUSTER_ENDPOINT_TYPE_IP: + return clusterNodeIp(n); + case CLUSTER_ENDPOINT_TYPE_HOSTNAME: + return (hostname != NULL && hostname[0] != '\0') ? hostname : "?"; + case CLUSTER_ENDPOINT_TYPE_UNKNOWN_ENDPOINT: + return ""; + } + return "unknown"; +} + +int clusterAllowFailoverCmd(client *c) { + if (!server.cluster_enabled) { + return 1; + } + addReplyError(c,"FAILOVER not allowed in cluster mode. 
" + "Use CLUSTER FAILOVER command instead."); + return 0; +} + +void clusterPromoteSelfToMaster(void) { + replicationUnsetMaster(); +} diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h new file mode 100644 index 000000000..578b46fc3 --- /dev/null +++ b/src/cluster_legacy.h @@ -0,0 +1,360 @@ +#ifndef CLUSTER_LEGACY_H +#define CLUSTER_LEGACY_H + +#define CLUSTER_PORT_INCR 10000 /* Cluster port = baseport + PORT_INCR */ + +/* The following defines are amount of time, sometimes expressed as + * multiplicators of the node timeout value (when ending with MULT). */ +#define CLUSTER_FAIL_REPORT_VALIDITY_MULT 2 /* Fail report validity. */ +#define CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */ +#define CLUSTER_MF_TIMEOUT 5000 /* Milliseconds to do a manual failover. */ +#define CLUSTER_MF_PAUSE_MULT 2 /* Master pause manual failover mult. */ +#define CLUSTER_SLAVE_MIGRATION_DELAY 5000 /* Delay for slave migration. */ + +/* Reasons why a slave is not able to failover. */ +#define CLUSTER_CANT_FAILOVER_NONE 0 +#define CLUSTER_CANT_FAILOVER_DATA_AGE 1 +#define CLUSTER_CANT_FAILOVER_WAITING_DELAY 2 +#define CLUSTER_CANT_FAILOVER_EXPIRED 3 +#define CLUSTER_CANT_FAILOVER_WAITING_VOTES 4 +#define CLUSTER_CANT_FAILOVER_RELOG_PERIOD (10) /* seconds. */ + +/* clusterState todo_before_sleep flags. */ +#define CLUSTER_TODO_HANDLE_FAILOVER (1<<0) +#define CLUSTER_TODO_UPDATE_STATE (1<<1) +#define CLUSTER_TODO_SAVE_CONFIG (1<<2) +#define CLUSTER_TODO_FSYNC_CONFIG (1<<3) +#define CLUSTER_TODO_HANDLE_MANUALFAILOVER (1<<4) + +/* clusterLink encapsulates everything needed to talk with a remote node. 
*/ +typedef struct clusterLink { + mstime_t ctime; /* Link creation time */ + connection *conn; /* Connection to remote node */ + list *send_msg_queue; /* List of messages to be sent */ + size_t head_msg_send_offset; /* Number of bytes already sent of message at head of queue */ + unsigned long long send_msg_queue_mem; /* Memory in bytes used by message queue */ + char *rcvbuf; /* Packet reception buffer */ + size_t rcvbuf_len; /* Used size of rcvbuf */ + size_t rcvbuf_alloc; /* Allocated size of rcvbuf */ + clusterNode *node; /* Node related to this link. Initialized to NULL when unknown */ + int inbound; /* 1 if this link is an inbound link accepted from the related node */ +} clusterLink; + +/* Cluster node flags and macros. */ +#define CLUSTER_NODE_MASTER 1 /* The node is a master */ +#define CLUSTER_NODE_SLAVE 2 /* The node is a slave */ +#define CLUSTER_NODE_PFAIL 4 /* Failure? Need acknowledge */ +#define CLUSTER_NODE_FAIL 8 /* The node is believed to be malfunctioning */ +#define CLUSTER_NODE_MYSELF 16 /* This node is myself */ +#define CLUSTER_NODE_HANDSHAKE 32 /* We have still to exchange the first ping */ +#define CLUSTER_NODE_NOADDR 64 /* We don't know the address of this node */ +#define CLUSTER_NODE_MEET 128 /* Send a MEET message to this node */ +#define CLUSTER_NODE_MIGRATE_TO 256 /* Master eligible for replica migration. */ +#define CLUSTER_NODE_NOFAILOVER 512 /* Slave will not try to failover. 
*/ +#define CLUSTER_NODE_NULL_NAME "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" + +#define nodeIsSlave(n) ((n)->flags & CLUSTER_NODE_SLAVE) +#define nodeInHandshake(n) ((n)->flags & CLUSTER_NODE_HANDSHAKE) +#define nodeHasAddr(n) (!((n)->flags & CLUSTER_NODE_NOADDR)) +#define nodeTimedOut(n) ((n)->flags & CLUSTER_NODE_PFAIL) +#define nodeFailed(n) ((n)->flags & CLUSTER_NODE_FAIL) +#define nodeCantFailover(n) ((n)->flags & CLUSTER_NODE_NOFAILOVER) + +/* This structure represent elements of node->fail_reports. */ +typedef struct clusterNodeFailReport { + clusterNode *node; /* Node reporting the failure condition. */ + mstime_t time; /* Time of the last report from this node. */ +} clusterNodeFailReport; + +/* Redis cluster messages header */ + +/* Message types. + * + * Note that the PING, PONG and MEET messages are actually the same exact + * kind of packet. PONG is the reply to ping, in the exact format as a PING, + * while MEET is a special PING that forces the receiver to add the sender + * as a node (if it is not already in the list). */ +#define CLUSTERMSG_TYPE_PING 0 /* Ping */ +#define CLUSTERMSG_TYPE_PONG 1 /* Pong (reply to Ping) */ +#define CLUSTERMSG_TYPE_MEET 2 /* Meet "let's join" message */ +#define CLUSTERMSG_TYPE_FAIL 3 /* Mark node xxx as failing */ +#define CLUSTERMSG_TYPE_PUBLISH 4 /* Pub/Sub Publish propagation */ +#define CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST 5 /* May I failover? */ +#define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6 /* Yes, you have my vote */ +#define CLUSTERMSG_TYPE_UPDATE 7 /* Another node slots configuration */ +#define CLUSTERMSG_TYPE_MFSTART 8 /* Pause clients for manual failover */ +#define CLUSTERMSG_TYPE_MODULE 9 /* Module cluster API message. */ +#define CLUSTERMSG_TYPE_PUBLISHSHARD 10 /* Pub/Sub Publish shard propagation */ +#define CLUSTERMSG_TYPE_COUNT 11 /* Total number of message types. 
*/ + +/* Initially we don't know our "name", but we'll find it once we connect + * to the first node, using the getsockname() function. Then we'll use this + * address for all the next messages. */ +typedef struct { + char nodename[CLUSTER_NAMELEN]; + uint32_t ping_sent; + uint32_t pong_received; + char ip[NET_IP_STR_LEN]; /* IP address last time it was seen */ + uint16_t port; /* primary port last time it was seen */ + uint16_t cport; /* cluster port last time it was seen */ + uint16_t flags; /* node->flags copy */ + uint16_t pport; /* secondary port last time it was seen */ + uint16_t notused1; +} clusterMsgDataGossip; + +typedef struct { + char nodename[CLUSTER_NAMELEN]; +} clusterMsgDataFail; + +typedef struct { + uint32_t channel_len; + uint32_t message_len; + unsigned char bulk_data[8]; /* 8 bytes just as placeholder. */ +} clusterMsgDataPublish; + +typedef struct { + uint64_t configEpoch; /* Config epoch of the specified instance. */ + char nodename[CLUSTER_NAMELEN]; /* Name of the slots owner. */ + unsigned char slots[CLUSTER_SLOTS/8]; /* Slots bitmap. */ +} clusterMsgDataUpdate; + +typedef struct { + uint64_t module_id; /* ID of the sender module. */ + uint32_t len; /* ID of the sender module. */ + uint8_t type; /* Type from 0 to 255. */ + unsigned char bulk_data[3]; /* 3 bytes just as placeholder. */ +} clusterMsgModule; + +/* The cluster supports optional extension messages that can be sent + * along with ping/pong/meet messages to give additional info in a + * consistent manner. */ +typedef enum { + CLUSTERMSG_EXT_TYPE_HOSTNAME, + CLUSTERMSG_EXT_TYPE_HUMAN_NODENAME, + CLUSTERMSG_EXT_TYPE_FORGOTTEN_NODE, + CLUSTERMSG_EXT_TYPE_SHARDID, +} clusterMsgPingtypes; + +/* Helper function for making sure extensions are eight byte aligned. */ +#define EIGHT_BYTE_ALIGN(size) ((((size) + 7) / 8) * 8) + +typedef struct { + char hostname[1]; /* The announced hostname, ends with \0. 
*/ +} clusterMsgPingExtHostname; + +typedef struct { + char human_nodename[1]; /* The announced nodename, ends with \0. */ +} clusterMsgPingExtHumanNodename; + +typedef struct { + char name[CLUSTER_NAMELEN]; /* Node name. */ + uint64_t ttl; /* Remaining time to blacklist the node, in seconds. */ +} clusterMsgPingExtForgottenNode; + +static_assert(sizeof(clusterMsgPingExtForgottenNode) % 8 == 0, ""); + +typedef struct { + char shard_id[CLUSTER_NAMELEN]; /* The shard_id, 40 bytes fixed. */ +} clusterMsgPingExtShardId; + +typedef struct { + uint32_t length; /* Total length of this extension message (including this header) */ + uint16_t type; /* Type of this extension message (see clusterMsgPingExtTypes) */ + uint16_t unused; /* 16 bits of padding to make this structure 8 byte aligned. */ + union { + clusterMsgPingExtHostname hostname; + clusterMsgPingExtHumanNodename human_nodename; + clusterMsgPingExtForgottenNode forgotten_node; + clusterMsgPingExtShardId shard_id; + } ext[]; /* Actual extension information, formatted so that the data is 8 + * byte aligned, regardless of its content. */ +} clusterMsgPingExt; + +union clusterMsgData { + /* PING, MEET and PONG */ + struct { + /* Array of N clusterMsgDataGossip structures */ + clusterMsgDataGossip gossip[1]; + /* Extension data that can optionally be sent for ping/meet/pong + * messages. We can't explicitly define them here though, since + * the gossip array isn't the real length of the gossip data. */ + } ping; + + /* FAIL */ + struct { + clusterMsgDataFail about; + } fail; + + /* PUBLISH */ + struct { + clusterMsgDataPublish msg; + } publish; + + /* UPDATE */ + struct { + clusterMsgDataUpdate nodecfg; + } update; + + /* MODULE */ + struct { + clusterMsgModule msg; + } module; +}; + +#define CLUSTER_PROTO_VER 1 /* Cluster bus protocol version. */ + +typedef struct { + char sig[4]; /* Signature "RCmb" (Redis Cluster message bus). 
*/ + uint32_t totlen; /* Total length of this message */ + uint16_t ver; /* Protocol version, currently set to 1. */ + uint16_t port; /* Primary port number (TCP or TLS). */ + uint16_t type; /* Message type */ + uint16_t count; /* Only used for some kind of messages. */ + uint64_t currentEpoch; /* The epoch accordingly to the sending node. */ + uint64_t configEpoch; /* The config epoch if it's a master, or the last + epoch advertised by its master if it is a + slave. */ + uint64_t offset; /* Master replication offset if node is a master or + processed replication offset if node is a slave. */ + char sender[CLUSTER_NAMELEN]; /* Name of the sender node */ + unsigned char myslots[CLUSTER_SLOTS/8]; + char slaveof[CLUSTER_NAMELEN]; + char myip[NET_IP_STR_LEN]; /* Sender IP, if not all zeroed. */ + uint16_t extensions; /* Number of extensions sent along with this packet. */ + char notused1[30]; /* 30 bytes reserved for future usage. */ + uint16_t pport; /* Secondary port number: if primary port is TCP port, this is + TLS port, and if primary port is TLS port, this is TCP port.*/ + uint16_t cport; /* Sender TCP cluster bus port */ + uint16_t flags; /* Sender node flags */ + unsigned char state; /* Cluster state from the POV of the sender */ + unsigned char mflags[3]; /* Message flags: CLUSTERMSG_FLAG[012]_... */ + union clusterMsgData data; +} clusterMsg; + +/* clusterMsg defines the gossip wire protocol exchanged among Redis cluster + * members, which can be running different versions of redis-server bits, + * especially during cluster rolling upgrades. + * + * Therefore, fields in this struct should remain at the same offset from + * release to release. The static asserts below ensures that incompatible + * changes in clusterMsg be caught at compile time. 
+ */ + +static_assert(offsetof(clusterMsg, sig) == 0, "unexpected field offset"); +static_assert(offsetof(clusterMsg, totlen) == 4, "unexpected field offset"); +static_assert(offsetof(clusterMsg, ver) == 8, "unexpected field offset"); +static_assert(offsetof(clusterMsg, port) == 10, "unexpected field offset"); +static_assert(offsetof(clusterMsg, type) == 12, "unexpected field offset"); +static_assert(offsetof(clusterMsg, count) == 14, "unexpected field offset"); +static_assert(offsetof(clusterMsg, currentEpoch) == 16, "unexpected field offset"); +static_assert(offsetof(clusterMsg, configEpoch) == 24, "unexpected field offset"); +static_assert(offsetof(clusterMsg, offset) == 32, "unexpected field offset"); +static_assert(offsetof(clusterMsg, sender) == 40, "unexpected field offset"); +static_assert(offsetof(clusterMsg, myslots) == 80, "unexpected field offset"); +static_assert(offsetof(clusterMsg, slaveof) == 2128, "unexpected field offset"); +static_assert(offsetof(clusterMsg, myip) == 2168, "unexpected field offset"); +static_assert(offsetof(clusterMsg, extensions) == 2214, "unexpected field offset"); +static_assert(offsetof(clusterMsg, notused1) == 2216, "unexpected field offset"); +static_assert(offsetof(clusterMsg, pport) == 2246, "unexpected field offset"); +static_assert(offsetof(clusterMsg, cport) == 2248, "unexpected field offset"); +static_assert(offsetof(clusterMsg, flags) == 2250, "unexpected field offset"); +static_assert(offsetof(clusterMsg, state) == 2252, "unexpected field offset"); +static_assert(offsetof(clusterMsg, mflags) == 2253, "unexpected field offset"); +static_assert(offsetof(clusterMsg, data) == 2256, "unexpected field offset"); + +#define CLUSTERMSG_MIN_LEN (sizeof(clusterMsg)-sizeof(union clusterMsgData)) + +/* Message flags better specify the packet content or are used to + * provide some information about the node state. */ +#define CLUSTERMSG_FLAG0_PAUSED (1<<0) /* Master paused for manual failover. 
*/ +#define CLUSTERMSG_FLAG0_FORCEACK (1<<1) /* Give ACK to AUTH_REQUEST even if + master is up. */ +#define CLUSTERMSG_FLAG0_EXT_DATA (1<<2) /* Message contains extension data */ + +struct _clusterNode { + mstime_t ctime; /* Node object creation time. */ + char name[CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size */ + char shard_id[CLUSTER_NAMELEN]; /* shard id, hex string, sha1-size */ + int flags; /* CLUSTER_NODE_... */ + uint64_t configEpoch; /* Last configEpoch observed for this node */ + unsigned char slots[CLUSTER_SLOTS/8]; /* slots handled by this node */ + uint16_t *slot_info_pairs; /* Slots info represented as (start/end) pair (consecutive index). */ + int slot_info_pairs_count; /* Used number of slots in slot_info_pairs */ + int numslots; /* Number of slots handled by this node */ + int numslaves; /* Number of slave nodes, if this is a master */ + clusterNode **slaves; /* pointers to slave nodes */ + clusterNode *slaveof; /* pointer to the master node. Note that it + may be NULL even if the node is a slave + if we don't have the master node in our + tables. */ + unsigned long long last_in_ping_gossip; /* The number of the last carried in the ping gossip section */ + mstime_t ping_sent; /* Unix time we sent latest ping */ + mstime_t pong_received; /* Unix time we received the pong */ + mstime_t data_received; /* Unix time we received any data */ + mstime_t fail_time; /* Unix time when FAIL flag was set */ + mstime_t voted_time; /* Last time we voted for a slave of this master */ + mstime_t repl_offset_time; /* Unix time we received offset for this node */ + mstime_t orphaned_time; /* Starting time of orphaned master condition */ + long long repl_offset; /* Last known repl offset for this node. */ + char ip[NET_IP_STR_LEN]; /* Latest known IP address of this node */ + sds hostname; /* The known hostname for this node */ + sds human_nodename; /* The known human readable nodename for this node */ + int tcp_port; /* Latest known clients TCP port. 
*/ + int tls_port; /* Latest known clients TLS port */ + int cport; /* Latest known cluster port of this node. */ + clusterLink *link; /* TCP/IP link established toward this node */ + clusterLink *inbound_link; /* TCP/IP link accepted from this node */ + list *fail_reports; /* List of nodes signaling this as failing */ +}; + +struct clusterState { + clusterNode *myself; /* This node */ + uint64_t currentEpoch; + int state; /* CLUSTER_OK, CLUSTER_FAIL, ... */ + int size; /* Num of master nodes with at least one slot */ + dict *nodes; /* Hash table of name -> clusterNode structures */ + dict *shards; /* Hash table of shard_id -> list (of nodes) structures */ + dict *nodes_black_list; /* Nodes we don't re-add for a few seconds. */ + clusterNode *migrating_slots_to[CLUSTER_SLOTS]; + clusterNode *importing_slots_from[CLUSTER_SLOTS]; + clusterNode *slots[CLUSTER_SLOTS]; + rax *slots_to_channels; + /* The following fields are used to take the slave state on elections. */ + mstime_t failover_auth_time; /* Time of previous or next election. */ + int failover_auth_count; /* Number of votes received so far. */ + int failover_auth_sent; /* True if we already asked for votes. */ + int failover_auth_rank; /* This slave rank for current auth request. */ + uint64_t failover_auth_epoch; /* Epoch of the current election. */ + int cant_failover_reason; /* Why a slave is currently not able to + failover. See the CANT_FAILOVER_* macros. */ + /* Manual failover state in common. */ + mstime_t mf_end; /* Manual failover time limit (ms unixtime). + It is zero if there is no MF in progress. */ + /* Manual failover state of master. */ + clusterNode *mf_slave; /* Slave performing the manual failover. */ + /* Manual failover state of slave. */ + long long mf_master_offset; /* Master offset the slave needs to start MF + or -1 if still not received. */ + int mf_can_start; /* If non-zero signal that the manual failover + can start requesting masters vote. 
*/ + /* The following fields are used by masters to take state on elections. */ + uint64_t lastVoteEpoch; /* Epoch of the last vote granted. */ + int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */ + /* Stats */ + /* Messages received and sent by type. */ + long long stats_bus_messages_sent[CLUSTERMSG_TYPE_COUNT]; + long long stats_bus_messages_received[CLUSTERMSG_TYPE_COUNT]; + long long stats_pfail_nodes; /* Number of nodes in PFAIL status, + excluding nodes without address. */ + unsigned long long stat_cluster_links_buffer_limit_exceeded; /* Total number of cluster links freed due to exceeding buffer limit */ + + /* Bit map for slots that are no longer claimed by the owner in cluster PING + * messages. During slot migration, the owner will stop claiming the slot after + * the ownership transfer. Set the bit corresponding to the slot when a node + * stops claiming the slot. This prevents spreading incorrect information (that + * source still owns the slot) using UPDATE messages. */ + unsigned char owner_not_claiming_slot[CLUSTER_SLOTS / 8]; +}; + + +#endif //CLUSTER_LEGACY_H diff --git a/src/db.c b/src/db.c index c4c22907e..a369c9a9a 100644 --- a/src/db.c +++ b/src/db.c @@ -2197,7 +2197,7 @@ int dbExpand(const redisDb *db, uint64_t db_size, dbKeyType keyType, int try_exp dict *d; if (server.cluster_enabled) { for (int i = 0; i < CLUSTER_SLOTS; i++) { - if (clusterNodeGetSlotBit(server.cluster->myself, i)) { + if (clusterNodeCoversSlot(getMyClusterNode(), i)) { /* We don't know exact number of keys that would fall into each slot, but we can approximate it, assuming even distribution. 
*/ if (keyType == DB_MAIN) { d = db->dict[i]; diff --git a/src/debug.c b/src/debug.c index 1d1176359..26cb98f1e 100644 --- a/src/debug.c +++ b/src/debug.c @@ -496,11 +496,9 @@ void debugCommand(client *c) { " In case RESET is provided the peak reset time will be restored to the default value", "REPLYBUFFER RESIZING <0|1>", " Enable or disable the reply buffer resize cron job", -"CLUSTERLINK KILL ", -" Kills the link based on the direction to/from (both) with the provided node." , NULL }; - addReplyHelp(c, help); + addExtendedReplyHelp(c, help, clusterDebugCommandExtendedHelp()); } else if (!strcasecmp(c->argv[1]->ptr,"segfault")) { /* Compiler gives warnings about writing to a random address * e.g "*((char*)-1) = 'x';". As a workaround, we map a read-only area @@ -1023,34 +1021,7 @@ NULL return; } addReply(c, shared.ok); - } else if(!strcasecmp(c->argv[1]->ptr,"CLUSTERLINK") && - !strcasecmp(c->argv[2]->ptr,"KILL") && - c->argc == 5) { - if (!server.cluster_enabled) { - addReplyError(c, "Debug option only available for cluster mode enabled setup!"); - return; - } - - /* Find the node. */ - clusterNode *n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); - if (!n) { - addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[4]->ptr); - return; - } - - /* Terminate the link based on the direction or all. 
*/ - if (!strcasecmp(c->argv[3]->ptr,"from")) { - freeClusterLink(n->inbound_link); - } else if (!strcasecmp(c->argv[3]->ptr,"to")) { - freeClusterLink(n->link); - } else if (!strcasecmp(c->argv[3]->ptr,"all")) { - freeClusterLink(n->link); - freeClusterLink(n->inbound_link); - } else { - addReplyErrorFormat(c, "Unknown direction %s", (char*) c->argv[3]->ptr); - } - addReply(c,shared.ok); - } else { + } else if(!handleDebugClusterCommand(c)) { addReplySubcommandSyntaxError(c); return; } diff --git a/src/module.c b/src/module.c index 0428ac59c..b33192e08 100644 --- a/src/module.c +++ b/src/module.c @@ -6466,7 +6466,7 @@ RedisModuleCallReply *RM_Call(RedisModuleCtx *ctx, const char *cmdname, const ch c->flags &= ~(CLIENT_READONLY|CLIENT_ASKING); c->flags |= ctx->client->flags & (CLIENT_READONLY|CLIENT_ASKING); if (getNodeByQuery(c,c->cmd,c->argv,c->argc,NULL,&error_code) != - server.cluster->myself) + getMyClusterNode()) { sds msg = NULL; if (error_code == CLUSTER_REDIR_DOWN_RO_STATE) { @@ -8917,23 +8917,7 @@ char **RM_GetClusterNodesList(RedisModuleCtx *ctx, size_t *numnodes) { UNUSED(ctx); if (!server.cluster_enabled) return NULL; - size_t count = dictSize(server.cluster->nodes); - char **ids = zmalloc((count+1)*REDISMODULE_NODE_ID_LEN); - dictIterator *di = dictGetIterator(server.cluster->nodes); - dictEntry *de; - int j = 0; - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - if (node->flags & (CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE)) continue; - ids[j] = zmalloc(REDISMODULE_NODE_ID_LEN); - memcpy(ids[j],node->name,REDISMODULE_NODE_ID_LEN); - j++; - } - *numnodes = j; - ids[j] = NULL; /* Null term so that FreeClusterNodesList does not need - * to also get the count argument. */ - dictReleaseIterator(di); - return ids; + return getClusterNodesList(numnodes); } /* Free the node list obtained with RedisModule_GetClusterNodesList. */ @@ -8947,7 +8931,7 @@ void RM_FreeClusterNodesList(char **ids) { * is disabled. 
*/ const char *RM_GetMyClusterID(void) { if (!server.cluster_enabled) return NULL; - return server.cluster->myself->name; + return getMyClusterId(); } /* Return the number of nodes in the cluster, regardless of their state @@ -8956,7 +8940,7 @@ const char *RM_GetMyClusterID(void) { * cluster mode, zero is returned. */ size_t RM_GetClusterSize(void) { if (!server.cluster_enabled) return 0; - return dictSize(server.cluster->nodes); + return getClusterSize(); } /* Populate the specified info for the node having as ID the specified 'id', @@ -8983,20 +8967,19 @@ int RM_GetClusterNodeInfo(RedisModuleCtx *ctx, const char *id, char *ip, char *m UNUSED(ctx); clusterNode *node = clusterLookupNode(id, strlen(id)); - if (node == NULL || - node->flags & (CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE)) + if (node == NULL || clusterNodePending(node)) { return REDISMODULE_ERR; } - if (ip) redis_strlcpy(ip,node->ip,NET_IP_STR_LEN); + if (ip) redis_strlcpy(ip, clusterNodeIp(node),NET_IP_STR_LEN); if (master_id) { /* If the information is not available, the function will set the * field to zero bytes, so that when the field can't be populated the * function kinda remains predictable. */ - if (node->flags & CLUSTER_NODE_SLAVE && node->slaveof) - memcpy(master_id,node->slaveof->name,REDISMODULE_NODE_ID_LEN); + if (clusterNodeIsSlave(node) && clusterNodeGetSlaveof(node)) + memcpy(master_id, clusterNodeGetName(clusterNodeGetSlaveof(node)) ,REDISMODULE_NODE_ID_LEN); else memset(master_id,0,REDISMODULE_NODE_ID_LEN); } @@ -9006,12 +8989,12 @@ int RM_GetClusterNodeInfo(RedisModuleCtx *ctx, const char *id, char *ip, char *m * we can provide binary compatibility. 
*/ if (flags) { *flags = 0; - if (node->flags & CLUSTER_NODE_MYSELF) *flags |= REDISMODULE_NODE_MYSELF; - if (node->flags & CLUSTER_NODE_MASTER) *flags |= REDISMODULE_NODE_MASTER; - if (node->flags & CLUSTER_NODE_SLAVE) *flags |= REDISMODULE_NODE_SLAVE; - if (node->flags & CLUSTER_NODE_PFAIL) *flags |= REDISMODULE_NODE_PFAIL; - if (node->flags & CLUSTER_NODE_FAIL) *flags |= REDISMODULE_NODE_FAIL; - if (node->flags & CLUSTER_NODE_NOFAILOVER) *flags |= REDISMODULE_NODE_NOFAILOVER; + if (clusterNodeIsMyself(node)) *flags |= REDISMODULE_NODE_MYSELF; + if (clusterNodeIsMaster(node)) *flags |= REDISMODULE_NODE_MASTER; + if (clusterNodeIsSlave(node)) *flags |= REDISMODULE_NODE_SLAVE; + if (clusterNodeTimedOut(node)) *flags |= REDISMODULE_NODE_PFAIL; + if (clusterNodeIsFailing(node)) *flags |= REDISMODULE_NODE_FAIL; + if (clusterNodeIsNoFailover(node)) *flags |= REDISMODULE_NODE_NOFAILOVER; } return REDISMODULE_OK; } diff --git a/src/networking.c b/src/networking.c index 718271aec..847eee3d5 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1117,14 +1117,18 @@ void addReplyVerbatim(client *c, const char *s, size_t len, const char *ext) { } } -/* Add an array of C strings as status replies with a heading. - * This function is typically invoked by from commands that support - * subcommands in response to the 'help' subcommand. The help array - * is terminated by NULL sentinel. */ -void addReplyHelp(client *c, const char **help) { +/* This function is similar to the addReplyHelp function but adds the + * ability to pass in two arrays of strings. Some commands have + * some additional subcommands based on the specific feature implementation + * Redis is compiled with (currently just clustering). This function allows + * to pass is the common subcommands in `help` and any implementation + * specific subcommands in `extended_help`. 
+ */ +void addExtendedReplyHelp(client *c, const char **help, const char **extended_help) { sds cmd = sdsnew((char*) c->argv[0]->ptr); void *blenp = addReplyDeferredLen(c); int blen = 0; + int idx = 0; sdstoupper(cmd); addReplyStatusFormat(c, @@ -1132,6 +1136,10 @@ void addReplyHelp(client *c, const char **help) { sdsfree(cmd); while (help[blen]) addReplyStatus(c,help[blen++]); + if (extended_help) { + while (extended_help[idx]) addReplyStatus(c,extended_help[idx++]); + } + blen += idx; addReplyStatus(c,"HELP"); addReplyStatus(c," Print this help."); @@ -1141,6 +1149,14 @@ void addReplyHelp(client *c, const char **help) { setDeferredArrayLen(c,blenp,blen); } +/* Add an array of C strings as status replies with a heading. + * This function is typically invoked by commands that support + * subcommands in response to the 'help' subcommand. The help array + * is terminated by NULL sentinel. */ +void addReplyHelp(client *c, const char **help) { + addExtendedReplyHelp(c, help, NULL); +} + /* Add a suggestive error reply. * This function is typically invoked by from commands that support * subcommands in response to an unknown subcommand or argument error. */ diff --git a/src/replication.c b/src/replication.c index 313f69152..e4b7c42e1 100644 --- a/src/replication.c +++ b/src/replication.c @@ -951,7 +951,11 @@ void syncCommand(client *c) { } if (!strcasecmp(c->argv[1]->ptr,server.replid)) { - replicationUnsetMaster(); + if (server.cluster_enabled) { + clusterPromoteSelfToMaster(); + } else { + replicationUnsetMaster(); + } sds client = catClientInfoString(sdsempty(),c); serverLog(LL_NOTICE, "MASTER MODE enabled (failover request from '%s')",client); @@ -3774,7 +3778,7 @@ void replicationCron(void) { * match the one stored into 'mf_master_offset' state. 
*/ int manual_failover_in_progress = ((server.cluster_enabled && - server.cluster->mf_end) || + clusterManualFailoverTimeLimit()) || server.failover_end_time) && isPausedActionsWithUpdate(PAUSE_ACTION_REPLICA); @@ -4061,12 +4065,10 @@ void abortFailover(const char *err) { * will attempt forever and must be manually aborted. */ void failoverCommand(client *c) { - if (server.cluster_enabled) { - addReplyError(c,"FAILOVER not allowed in cluster mode. " - "Use CLUSTER FAILOVER command instead."); + if (!clusterAllowFailoverCmd(c)) { return; } - + /* Handle special case for abort */ if ((c->argc == 2) && !strcasecmp(c->argv[1]->ptr,"abort")) { if (server.failover_state == NO_FAILOVER) { diff --git a/src/script.c b/src/script.c index d0b9b9635..678773d96 100644 --- a/src/script.c +++ b/src/script.c @@ -429,7 +429,7 @@ static int scriptVerifyClusterState(scriptRunCtx *run_ctx, client *c, client *or c->flags &= ~(CLIENT_READONLY | CLIENT_ASKING); c->flags |= original_c->flags & (CLIENT_READONLY | CLIENT_ASKING); int hashslot = -1; - if (getNodeByQuery(c, c->cmd, c->argv, c->argc, &hashslot, &error_code) != server.cluster->myself) { + if (getNodeByQuery(c, c->cmd, c->argv, c->argc, &hashslot, &error_code) != getMyClusterNode()) { if (error_code == CLUSTER_REDIR_DOWN_RO_STATE) { *err = sdsnew( "Script attempted to execute a write command while the " diff --git a/src/server.c b/src/server.c index e63a2ffff..454bd969d 100644 --- a/src/server.c +++ b/src/server.c @@ -4037,7 +4037,7 @@ int processCommand(client *c) { int error_code; clusterNode *n = getNodeByQuery(c,c->cmd,c->argv,c->argc, &c->slot,&error_code); - if (n == NULL || n != server.cluster->myself) { + if (n == NULL || !clusterNodeIsMyself(n)) { if (c->cmd->proc == execCommand) { discardTransaction(c); } else { @@ -6838,7 +6838,7 @@ int redisIsSupervised(int mode) { int iAmMaster(void) { return ((!server.cluster_enabled && server.masterhost == NULL) || - (server.cluster_enabled && 
nodeIsMaster(server.cluster->myself))); + (server.cluster_enabled && clusterNodeIsMaster(getMyClusterNode()))); } #ifdef REDIS_TEST @@ -7161,7 +7161,7 @@ int main(int argc, char **argv) { ACLLoadUsersAtStartup(); initListeners(); if (server.cluster_enabled) { - clusterInitListeners(); + clusterInitLast(); } InitServerLast(); diff --git a/src/server.h b/src/server.h index 902050889..66fb288aa 100644 --- a/src/server.h +++ b/src/server.h @@ -738,6 +738,7 @@ struct RedisModuleCtx; struct moduleLoadQueueEntry; struct RedisModuleKeyOptCtx; struct RedisModuleCommand; +struct clusterState; /* Each module type implementation should export a set of methods in order * to serialize and deserialize the value in the RDB file, rewrite the AOF @@ -2626,6 +2627,7 @@ void addReplySetLen(client *c, long length); void addReplyAttributeLen(client *c, long length); void addReplyPushLen(client *c, long length); void addReplyHelp(client *c, const char **help); +void addExtendedReplyHelp(client *c, const char **help, const char **extended_help); void addReplySubcommandSyntaxError(client *c); void addReplyLoadedModules(client *c); void copyReplicaOutputBuffer(client *dst, client *src);