Fix occasional hangs on replication reconnection. (#7830)

This happens only on diskless replicas when attempting to reconnect after 
failing to load an RDB file. It is more likely to occur with larger datasets.

After reconnection is initiated, replicationEmptyDbCallback() may still get
called and try to write to a socket that is not yet connected. That failed
write triggers a second issue: the connection is put into an error state, so
the connect handler never gets called and the replica hangs. The problem is a
regression introduced by commit c17e597.
This commit is contained in:
Yossi Gottlieb 2020-09-22 11:38:52 +03:00 committed by GitHub
parent 63a05dde46
commit 1980f639b1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 14 additions and 3 deletions

View File

@ -168,6 +168,11 @@ static int connSocketWrite(connection *conn, const void *data, size_t data_len)
int ret = write(conn->fd, data, data_len);
if (ret < 0 && errno != EAGAIN) {
conn->last_errno = errno;
/* Only move a connection that is already connected into the error state.
* A connection that is not yet connected keeps its current state, so that
* its handler callbacks are not disrupted.
*/
if (conn->state == CONN_STATE_CONNECTED)
conn->state = CONN_STATE_ERROR;
}
@ -180,6 +185,11 @@ static int connSocketRead(connection *conn, void *buf, size_t buf_len) {
conn->state = CONN_STATE_CLOSED;
} else if (ret < 0 && errno != EAGAIN) {
conn->last_errno = errno;
/* Only move a connection that is already connected into the error state.
* A connection that is not yet connected keeps its current state, so that
* its handler callbacks are not disrupted.
*/
if (conn->state == CONN_STATE_CONNECTED)
conn->state = CONN_STATE_ERROR;
}

View File

@ -1361,6 +1361,7 @@ void replicationSendNewlineToMaster(void) {
* the new dataset received by the master. */
void replicationEmptyDbCallback(void *privdata) {
    UNUSED(privdata);

    /* Only ping the master while the replication state is still TRANSFER.
     * In any other state (e.g. once a reconnection has been initiated) the
     * newline must not be sent, since the socket may not be connected. */
    if (server.repl_state != REPL_STATE_TRANSFER) return;

    replicationSendNewlineToMaster();
}