2013-10-09 09:37:20 -04:00
|
|
|
#ifndef __REDIS_CLUSTER_H
|
|
|
|
#define __REDIS_CLUSTER_H
|
|
|
|
|
|
|
|
/*-----------------------------------------------------------------------------
 * Redis cluster data structures, defines, exported API.
 *----------------------------------------------------------------------------*/
|
|
|
|
|
|
|
|
/* Core cluster constants. */
#define REDIS_CLUSTER_SLOTS 16384 /* Number of slots; sizes the per-node
                                     bitmaps and the per-slot tables. */
#define REDIS_CLUSTER_OK 0 /* Everything looks ok */
#define REDIS_CLUSTER_FAIL 1 /* The cluster can't work */
#define REDIS_CLUSTER_NAMELEN 40 /* sha1 hex length */
#define REDIS_CLUSTER_PORT_INCR 10000 /* Cluster port = baseport + PORT_INCR */
|
|
|
|
|
|
|
|
/* The following defines are amounts of time, sometimes expressed as
 * multipliers of the node timeout value (when ending with MULT). */
#define REDIS_CLUSTER_DEFAULT_NODE_TIMEOUT 15000
#define REDIS_CLUSTER_DEFAULT_SLAVE_VALIDITY 10 /* Slave max data age factor. */
#define REDIS_CLUSTER_FAIL_REPORT_VALIDITY_MULT 2 /* Fail report validity. */
#define REDIS_CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */
#define REDIS_CLUSTER_FAIL_UNDO_TIME_ADD 10 /* Some additional time. */
#define REDIS_CLUSTER_FAILOVER_DELAY 5 /* Seconds */
#define REDIS_CLUSTER_DEFAULT_MIGRATION_BARRIER 1
#define REDIS_CLUSTER_MF_TIMEOUT 5000 /* Milliseconds to do a manual failover. */
#define REDIS_CLUSTER_MF_PAUSE_MULT 2 /* Master pause manual failover mult. */
|
2013-10-09 09:37:20 -04:00
|
|
|
|
2014-03-07 07:19:09 -05:00
|
|
|
/* Redirection errors returned by getNodeByQuery(). */
#define REDIS_CLUSTER_REDIR_NONE 0 /* Node can serve the request. */
#define REDIS_CLUSTER_REDIR_CROSS_SLOT 1 /* Keys in different slots. */
#define REDIS_CLUSTER_REDIR_UNSTABLE 2 /* Keys in slot resharding. */
#define REDIS_CLUSTER_REDIR_ASK 3 /* -ASK redirection required. */
#define REDIS_CLUSTER_REDIR_MOVED 4 /* -MOVED redirection required. */
|
|
|
|
|
2013-10-09 09:37:20 -04:00
|
|
|
struct clusterNode;
|
|
|
|
|
|
|
|
/* clusterLink encapsulates everything needed to talk with a remote node. */
|
|
|
|
typedef struct clusterLink {
|
2013-10-09 10:18:33 -04:00
|
|
|
mstime_t ctime; /* Link creation time */
|
2013-10-09 09:37:20 -04:00
|
|
|
int fd; /* TCP socket file descriptor */
|
|
|
|
sds sndbuf; /* Packet send buffer */
|
|
|
|
sds rcvbuf; /* Packet reception buffer */
|
|
|
|
struct clusterNode *node; /* Node related to this link if any, or NULL */
|
|
|
|
} clusterLink;
|
|
|
|
|
2014-01-29 06:17:16 -05:00
|
|
|
/* Cluster node flags and macros.
 *
 * Each flag is a distinct bit, so several can be OR-ed together in the
 * 'flags' field of a node. */
#define REDIS_NODE_MASTER 1     /* The node is a master */
#define REDIS_NODE_SLAVE 2      /* The node is a slave */
#define REDIS_NODE_PFAIL 4      /* Failure? Need acknowledge */
#define REDIS_NODE_FAIL 8       /* The node is believed to be malfunctioning */
#define REDIS_NODE_MYSELF 16    /* This node is myself */
#define REDIS_NODE_HANDSHAKE 32 /* We have still to exchange the first ping */
#define REDIS_NODE_NOADDR 64    /* We don't know the address of this node */
#define REDIS_NODE_MEET 128     /* Send a MEET message to this node */
#define REDIS_NODE_PROMOTED 256 /* Master was a slave promoted by failover */
/* A node name made only of NUL bytes (40 of them, plus the implicit
 * string terminator). */
#define REDIS_NODE_NULL_NAME "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"

/* Flag predicates. 'n' is a pointer to anything with an integer 'flags'
 * member. Except for nodeHasAddr(), which yields exactly 0 or 1, these
 * evaluate to the masked bit itself (zero or the flag value), so use them
 * only in a boolean context. */
#define nodeIsMaster(n) ((n)->flags & REDIS_NODE_MASTER)
#define nodeIsSlave(n) ((n)->flags & REDIS_NODE_SLAVE)
#define nodeInHandshake(n) ((n)->flags & REDIS_NODE_HANDSHAKE)
#define nodeHasAddr(n) (!((n)->flags & REDIS_NODE_NOADDR))
#define nodeWithoutAddr(n) ((n)->flags & REDIS_NODE_NOADDR)
#define nodeTimedOut(n) ((n)->flags & REDIS_NODE_PFAIL)
#define nodeFailed(n) ((n)->flags & REDIS_NODE_FAIL)
|
|
|
|
|
2013-10-09 09:37:20 -04:00
|
|
|
/* This structure represent elements of node->fail_reports. */
|
2014-08-25 04:42:18 -04:00
|
|
|
typedef struct clusterNodeFailReport {
|
2013-10-09 09:37:20 -04:00
|
|
|
struct clusterNode *node; /* Node reporting the failure condition. */
|
2013-10-09 10:18:33 -04:00
|
|
|
mstime_t time; /* Time of the last report from this node. */
|
2014-08-25 04:42:18 -04:00
|
|
|
} clusterNodeFailReport;
|
2013-10-09 09:37:20 -04:00
|
|
|
|
2014-08-25 04:42:18 -04:00
|
|
|
typedef struct clusterNode {
|
2013-10-09 10:18:33 -04:00
|
|
|
mstime_t ctime; /* Node object creation time. */
|
2013-10-09 09:37:20 -04:00
|
|
|
char name[REDIS_CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size */
|
|
|
|
int flags; /* REDIS_NODE_... */
|
|
|
|
uint64_t configEpoch; /* Last configEpoch observed for this node */
|
|
|
|
unsigned char slots[REDIS_CLUSTER_SLOTS/8]; /* slots handled by this node */
|
|
|
|
int numslots; /* Number of slots handled by this node */
|
|
|
|
int numslaves; /* Number of slave nodes, if this is a master */
|
|
|
|
struct clusterNode **slaves; /* pointers to slave nodes */
|
|
|
|
struct clusterNode *slaveof; /* pointer to the master node */
|
2014-01-28 10:28:07 -05:00
|
|
|
mstime_t ping_sent; /* Unix time we sent latest ping */
|
|
|
|
mstime_t pong_received; /* Unix time we received the pong */
|
|
|
|
mstime_t fail_time; /* Unix time when FAIL flag was set */
|
|
|
|
mstime_t voted_time; /* Last time we voted for a slave of this master */
|
|
|
|
mstime_t repl_offset_time; /* Unix time we received offset for this node */
|
|
|
|
long long repl_offset; /* Last known repl offset for this node. */
|
2013-10-09 09:37:20 -04:00
|
|
|
char ip[REDIS_IP_STR_LEN]; /* Latest known IP address of this node */
|
|
|
|
int port; /* Latest known port of this node */
|
|
|
|
clusterLink *link; /* TCP/IP link with this node */
|
|
|
|
list *fail_reports; /* List of nodes signaling this as failing */
|
2014-08-25 04:42:18 -04:00
|
|
|
} clusterNode;
|
2013-10-09 09:37:20 -04:00
|
|
|
|
|
|
|
typedef struct clusterState {
|
|
|
|
clusterNode *myself; /* This node */
|
|
|
|
uint64_t currentEpoch;
|
|
|
|
int state; /* REDIS_CLUSTER_OK, REDIS_CLUSTER_FAIL, ... */
|
|
|
|
int size; /* Num of master nodes with at least one slot */
|
|
|
|
dict *nodes; /* Hash table of name -> clusterNode structures */
|
2013-11-29 11:37:06 -05:00
|
|
|
dict *nodes_black_list; /* Nodes we don't re-add for a few seconds. */
|
2013-10-09 09:37:20 -04:00
|
|
|
clusterNode *migrating_slots_to[REDIS_CLUSTER_SLOTS];
|
|
|
|
clusterNode *importing_slots_from[REDIS_CLUSTER_SLOTS];
|
|
|
|
clusterNode *slots[REDIS_CLUSTER_SLOTS];
|
|
|
|
zskiplist *slots_to_keys;
|
|
|
|
/* The following fields are used to take the slave state on elections. */
|
2013-11-29 10:17:05 -05:00
|
|
|
mstime_t failover_auth_time; /* Time of previous or next election. */
|
2013-10-09 09:37:20 -04:00
|
|
|
int failover_auth_count; /* Number of votes received so far. */
|
|
|
|
int failover_auth_sent; /* True if we already asked for votes. */
|
2014-01-29 10:51:11 -05:00
|
|
|
int failover_auth_rank; /* This slave rank for current auth request. */
|
2013-10-09 09:37:20 -04:00
|
|
|
uint64_t failover_auth_epoch; /* Epoch of the current election. */
|
2014-02-05 07:01:24 -05:00
|
|
|
/* Manual failover state in common. */
|
|
|
|
mstime_t mf_end; /* Manual failover time limit (ms unixtime).
|
|
|
|
It is zero if there is no MF in progress. */
|
|
|
|
/* Manual failover state of master. */
|
|
|
|
clusterNode *mf_slave; /* Slave performing the manual failover. */
|
|
|
|
/* Manual failover state of slave. */
|
|
|
|
long long mf_master_offset; /* Master offset the slave needs to start MF
|
|
|
|
or zero if stil not received. */
|
|
|
|
int mf_can_start; /* If non-zero signal that the manual failover
|
|
|
|
can start requesting masters vote. */
|
2013-10-09 09:37:20 -04:00
|
|
|
/* The followign fields are uesd by masters to take state on elections. */
|
2014-03-27 10:01:24 -04:00
|
|
|
uint64_t lastVoteEpoch; /* Epoch of the last vote granted. */
|
2013-10-09 09:37:20 -04:00
|
|
|
int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */
|
|
|
|
long long stats_bus_messages_sent; /* Num of msg sent via cluster bus. */
|
2013-11-29 10:17:05 -05:00
|
|
|
long long stats_bus_messages_received; /* Num of msg rcvd via cluster bus.*/
|
2013-10-09 09:37:20 -04:00
|
|
|
} clusterState;
|
|
|
|
|
|
|
|
/* clusterState todo_before_sleep flags (one bit each, can be OR-ed). */
#define CLUSTER_TODO_HANDLE_FAILOVER (1<<0)
#define CLUSTER_TODO_UPDATE_STATE (1<<1)
#define CLUSTER_TODO_SAVE_CONFIG (1<<2)
#define CLUSTER_TODO_FSYNC_CONFIG (1<<3)
|
|
|
|
|
|
|
|
/* Redis cluster messages header */
|
|
|
|
|
|
|
|
/* Note that the PING, PONG and MEET messages are actually the same exact
 * kind of packet. PONG is the reply to ping, in the exact format as a PING,
 * while MEET is a special PING that forces the receiver to add the sender
 * as a node (if it is not already in the list). */
#define CLUSTERMSG_TYPE_PING 0          /* Ping */
#define CLUSTERMSG_TYPE_PONG 1          /* Pong (reply to Ping) */
#define CLUSTERMSG_TYPE_MEET 2          /* Meet "let's join" message */
#define CLUSTERMSG_TYPE_FAIL 3          /* Mark node xxx as failing */
#define CLUSTERMSG_TYPE_PUBLISH 4       /* Pub/Sub Publish propagation */
#define CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST 5 /* May I failover? */
#define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6     /* Yes, you have my vote */
#define CLUSTERMSG_TYPE_UPDATE 7        /* Another node slots configuration */
#define CLUSTERMSG_TYPE_MFSTART 8       /* Pause clients for manual failover */
|
2013-10-09 09:37:20 -04:00
|
|
|
|
|
|
|
/* Initially we don't know our "name", but we'll find it once we connect
|
|
|
|
* to the first node, using the getsockname() function. Then we'll use this
|
|
|
|
* address for all the next messages. */
|
|
|
|
typedef struct {
|
|
|
|
char nodename[REDIS_CLUSTER_NAMELEN];
|
|
|
|
uint32_t ping_sent;
|
|
|
|
uint32_t pong_received;
|
2014-02-25 16:02:28 -05:00
|
|
|
char ip[REDIS_IP_STR_LEN]; /* IP address last time it was seen */
|
2013-10-09 09:37:20 -04:00
|
|
|
uint16_t port; /* port last time it was seen */
|
|
|
|
uint16_t flags;
|
|
|
|
uint32_t notused; /* for 64 bit alignment */
|
|
|
|
} clusterMsgDataGossip;
|
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
char nodename[REDIS_CLUSTER_NAMELEN];
|
|
|
|
} clusterMsgDataFail;
|
|
|
|
|
|
|
|
/* Payload of a PUBLISH message: two length fields followed by the data
 * bytes.
 * NOTE(review): channel_len and message_len presumably give the sizes of
 * the channel name and of the message stored back to back in bulk_data;
 * verify against the sender/receiver code in cluster.c. */
typedef struct {
    uint32_t channel_len;
    uint32_t message_len;
    unsigned char bulk_data[8]; /* defined as 8 just for alignment concerns. */
} clusterMsgDataPublish;
|
|
|
|
|
2013-11-08 10:26:50 -05:00
|
|
|
typedef struct {
|
|
|
|
uint64_t configEpoch; /* Config epoch of the specified instance. */
|
|
|
|
char nodename[REDIS_CLUSTER_NAMELEN]; /* Name of the slots owner. */
|
|
|
|
unsigned char slots[REDIS_CLUSTER_SLOTS/8]; /* Slots bitmap. */
|
|
|
|
} clusterMsgDataUpdate;
|
|
|
|
|
2013-10-09 09:37:20 -04:00
|
|
|
union clusterMsgData {
|
|
|
|
/* PING, MEET and PONG */
|
|
|
|
struct {
|
|
|
|
/* Array of N clusterMsgDataGossip structures */
|
|
|
|
clusterMsgDataGossip gossip[1];
|
|
|
|
} ping;
|
|
|
|
|
|
|
|
/* FAIL */
|
|
|
|
struct {
|
|
|
|
clusterMsgDataFail about;
|
|
|
|
} fail;
|
|
|
|
|
|
|
|
/* PUBLISH */
|
|
|
|
struct {
|
|
|
|
clusterMsgDataPublish msg;
|
|
|
|
} publish;
|
2013-11-08 10:26:50 -05:00
|
|
|
|
|
|
|
/* UPDATE */
|
|
|
|
struct {
|
|
|
|
clusterMsgDataUpdate nodecfg;
|
|
|
|
} update;
|
2013-10-09 09:37:20 -04:00
|
|
|
};
|
|
|
|
|
2014-02-05 07:01:24 -05:00
|
|
|
|
2013-10-09 09:37:20 -04:00
|
|
|
typedef struct {
|
2014-02-10 09:55:21 -05:00
|
|
|
char sig[4]; /* Siganture "RCmb" (Redis Cluster message bus). */
|
2014-02-10 10:27:33 -05:00
|
|
|
uint32_t totlen; /* Total length of this message */
|
2014-02-10 09:53:09 -05:00
|
|
|
uint16_t ver; /* Protocol version, currently set to 0. */
|
|
|
|
uint16_t notused0; /* 2 bytes not used. */
|
2013-10-09 09:37:20 -04:00
|
|
|
uint16_t type; /* Message type */
|
|
|
|
uint16_t count; /* Only used for some kind of messages. */
|
|
|
|
uint64_t currentEpoch; /* The epoch accordingly to the sending node. */
|
2014-01-28 10:34:23 -05:00
|
|
|
uint64_t configEpoch; /* The config epoch if it's a master, or the last
|
|
|
|
epoch advertised by its master if it is a
|
|
|
|
slave. */
|
2014-01-28 10:51:50 -05:00
|
|
|
uint64_t offset; /* Master replication offset if node is a master or
|
|
|
|
processed replication offset if node is a slave. */
|
2013-10-09 09:37:20 -04:00
|
|
|
char sender[REDIS_CLUSTER_NAMELEN]; /* Name of the sender node */
|
|
|
|
unsigned char myslots[REDIS_CLUSTER_SLOTS/8];
|
|
|
|
char slaveof[REDIS_CLUSTER_NAMELEN];
|
|
|
|
char notused1[32]; /* 32 bytes reserved for future usage. */
|
|
|
|
uint16_t port; /* Sender TCP base port */
|
|
|
|
uint16_t flags; /* Sender node flags */
|
|
|
|
unsigned char state; /* Cluster state from the POV of the sender */
|
2014-02-05 07:01:24 -05:00
|
|
|
unsigned char mflags[3]; /* Message flags: CLUSTERMSG_FLAG[012]_... */
|
2013-10-09 09:37:20 -04:00
|
|
|
union clusterMsgData data;
|
|
|
|
} clusterMsg;
|
|
|
|
|
|
|
|
/* Size of a clusterMsg once the payload union is subtracted from it. */
#define CLUSTERMSG_MIN_LEN \
    (sizeof(clusterMsg) - sizeof(union clusterMsgData))
|
|
|
|
|
2014-02-05 07:01:24 -05:00
|
|
|
/* Message flags better specify the packet content or are used to
 * provide some information about the node state. */
#define CLUSTERMSG_FLAG0_PAUSED (1<<0) /* Master paused for manual failover. */
#define CLUSTERMSG_FLAG0_FORCEACK (1<<1) /* Give ACK to AUTH_REQUEST even if
                                            master is up. */
|
2014-02-05 07:01:24 -05:00
|
|
|
|
2014-01-28 10:34:23 -05:00
|
|
|
/* ---------------------- API exported outside cluster.c -------------------- */
|
2013-10-09 09:37:20 -04:00
|
|
|
/* Exported by cluster.c.
 * NOTE(review): 'hashslot' and 'ask' look like output parameters and the
 * REDIS_CLUSTER_REDIR_* codes are documented in this header as returned by
 * this function; confirm the exact contract against the definition. */
clusterNode *getNodeByQuery(redisClient *c,
                            struct redisCommand *cmd,
                            robj **argv,
                            int argc,
                            int *hashslot,
                            int *ask);
|
|
|
|
|
|
|
|
#endif /* __REDIS_CLUSTER_H */
|