Fixed Sentinel support for hostnames (#10146)

Sentinel tries to resolve instances hostname to IP only during registration. 
It might be that the instance is unavailable during that time, such as 
leader crashed and failover took place. Yet, promoted replica must support:

 - Register leader, even if it fails to resolve its hostname during failover
 - Try later to resolve it, if instance is disconnected. Note that
   this condition also support ip-change of an instance.
This commit is contained in:
Moti Cohen 2022-01-29 21:00:29 +02:00 committed by GitHub
parent 2d391e62bd
commit 79f089bdd9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -552,10 +552,10 @@ void sentinelIsRunning(void) {
/* Create a sentinelAddr object and return it on success.
* On error NULL is returned and errno is set to:
* ENOENT: Can't resolve the hostname.
* ENOENT: Can't resolve the hostname, unless accept_unresolved is non-zero.
* EINVAL: Invalid port number.
*/
sentinelAddr *createSentinelAddr(char *hostname, int port) {
sentinelAddr *createSentinelAddr(char *hostname, int port, int is_accept_unresolved) {
char ip[NET_IP_STR_LEN];
sentinelAddr *sa;
@ -565,9 +565,15 @@ sentinelAddr *createSentinelAddr(char *hostname, int port) {
}
if (anetResolve(NULL,hostname,ip,sizeof(ip),
sentinel.resolve_hostnames ? ANET_NONE : ANET_IP_ONLY) == ANET_ERR) {
serverLog(LL_WARNING, "Failed to resolve hostname '%s'", hostname);
if (sentinel.resolve_hostnames && is_accept_unresolved) {
ip[0] = '\0';
}
else {
errno = ENOENT;
return NULL;
}
}
sa = zmalloc(sizeof(*sa));
sa->hostname = sdsnew(hostname);
sa->ip = sdsnew(ip);
@ -598,6 +604,15 @@ int sentinelAddrIsEqual(sentinelAddr *a, sentinelAddr *b) {
return a->port == b->port && !strcasecmp(a->ip,b->ip);
}
/* Return non-zero if the two addresses are equal, either by address
* or by hostname if they could not have been resolved.
*/
int sentinelAddrOrHostnameEqual(sentinelAddr *a, sentinelAddr *b) {
return a->port == b->port &&
(!strcmp(a->ip, b->ip) ||
!strcasecmp(a->hostname, b->hostname));
}
/* Return non-zero if a hostname matches an address. */
int sentinelAddrEqualsHostname(sentinelAddr *a, char *hostname) {
char ip[NET_IP_STR_LEN];
@ -1259,7 +1274,7 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char *
serverAssert((flags & SRI_MASTER) || master != NULL);
/* Check address validity. */
addr = createSentinelAddr(hostname,port);
addr = createSentinelAddr(hostname,port,1);
if (addr == NULL) return NULL;
/* For slaves use ip/host:port as name. */
@ -1387,7 +1402,7 @@ sentinelRedisInstance *sentinelRedisInstanceLookupSlave(
* If that is the case, depending on configuration we either resolve
* it and use the IP address or fail.
*/
addr = createSentinelAddr(slave_addr, port);
addr = createSentinelAddr(slave_addr, port, 0);
if (!addr) return NULL;
key = announceSentinelAddrAndPort(addr);
releaseSentinelAddr(addr);
@ -1450,8 +1465,10 @@ sentinelRedisInstance *getSentinelRedisInstanceByAddrAndRunID(dict *instances, c
serverAssert(addr || runid); /* User must pass at least one search param. */
if (addr != NULL) {
/* Resolve addr, we use the IP as a key even if a hostname is used */
ri_addr = createSentinelAddr(addr, port);
/* Try to resolve addr. If hostnames are used, we're accepting an ri_addr
* that contains an hostname only and can still be matched based on that.
*/
ri_addr = createSentinelAddr(addr,port,1);
if (!ri_addr) return NULL;
}
di = dictGetIterator(instances);
@ -1460,8 +1477,7 @@ sentinelRedisInstance *getSentinelRedisInstanceByAddrAndRunID(dict *instances, c
if (runid && !ri->runid) continue;
if ((runid == NULL || strcmp(ri->runid, runid) == 0) &&
(addr == NULL || (strcmp(ri->addr->ip, ri_addr->ip) == 0 &&
ri->addr->port == port)))
(addr == NULL || sentinelAddrOrHostnameEqual(ri->addr, ri_addr)))
{
instance = ri;
break;
@ -1565,7 +1581,7 @@ int sentinelResetMasterAndChangeAddress(sentinelRedisInstance *master, char *hos
dictIterator *di;
dictEntry *de;
newaddr = createSentinelAddr(hostname,port);
newaddr = createSentinelAddr(hostname,port,0);
if (newaddr == NULL) return C_ERR;
/* There can be only 0 or 1 slave that has the newaddr.
@ -2355,6 +2371,7 @@ static int instanceLinkNegotiateTLS(redisAsyncContext *context) {
* is disconnected. Note that link->disconnected is true even if just
* one of the two links (commands and pub/sub) is missing. */
void sentinelReconnectInstance(sentinelRedisInstance *ri) {
if (ri->link->disconnected == 0) return;
if (ri->addr->port == 0) return; /* port == 0 means invalid address. */
instanceLink *link = ri->link;
@ -2365,7 +2382,21 @@ void sentinelReconnectInstance(sentinelRedisInstance *ri) {
/* Commands connection. */
if (link->cc == NULL) {
/* It might be that the instance is disconnected because it wasn't available earlier when the instance
* allocated, say during failover, and therefore we failed to resolve its ip.
* Another scenario is that the instance restarted with new ip, and we should resolve its new ip based on
* its hostname */
if (sentinel.resolve_hostnames) {
sentinelAddr *tryResolveAddr = createSentinelAddr(ri->addr->hostname, ri->addr->port, 0);
if (tryResolveAddr != NULL) {
releaseSentinelAddr(ri->addr);
ri->addr = tryResolveAddr;
}
}
link->cc = redisAsyncConnectBind(ri->addr->ip,ri->addr->port,server.bind_source_addr);
if (link->cc && !link->cc->err) anetCloexec(link->cc->c.fd);
if (!link->cc) {
sentinelEvent(LL_DEBUG,"-cmd-link-reconnection",ri,"%@ #Failed to establish connection");
@ -4892,6 +4923,7 @@ sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master) {
max_master_down_time += master->down_after_period * 10;
di = dictGetIterator(master->slaves);
while((de = dictNext(di)) != NULL) {
sentinelRedisInstance *slave = dictGetVal(de);
mstime_t info_validity_time;