redict/tests/integration/psync2-master-restart.tcl

start_server {tags {"psync2 external:skip"}} {
start_server {} {
start_server {} {
    set master [srv 0 client]
    set master_host [srv 0 host]
    set master_port [srv 0 port]

    set replica [srv -1 client]
    set replica_host [srv -1 host]
    set replica_port [srv -1 port]

    set sub_replica [srv -2 client]

    # Make sure the server saves an RDB on shutdown
    $master config set save "3600 1"

    # Because we will test partial resync later, we don’t want a timeout to cause
    # the master-replica disconnect, then the extra reconnections will break the
    # sync_partial_ok stat test
    $master config set repl-timeout 3600
    $replica config set repl-timeout 3600
    $sub_replica config set repl-timeout 3600

    # Build replication chain
    $replica replicaof $master_host $master_port
    $sub_replica replicaof $replica_host $replica_port

    wait_for_condition 50 100 {
        [status $replica master_link_status] eq {up} &&
        [status $sub_replica master_link_status] eq {up}
    } else {
        fail "Replication not started."
    }

    # Avoid PINGs
    $master config set repl-ping-replica-period 3600
    $master config rewrite

    # Generate some data
    createComplexDataset $master 1000

    test "PSYNC2: Partial resync after Master restart using RDB aux fields" {
        wait_for_condition 500 100 {
            [status $master master_repl_offset] == [status $replica master_repl_offset] &&
            [status $master master_repl_offset] == [status $sub_replica master_repl_offset]
        } else {
            fail "Replicas and master offsets were unable to match *exactly*."
        }

        set replid [status $master master_replid]
        set offset [status $master master_repl_offset]
        $replica config resetstat

        catch {
            # SHUTDOWN NOW ensures master doesn't send GETACK to replicas before
            # shutting down which would affect the replication offset.
            restart_server 0 true false true now
            set master [srv 0 client]
        }
        wait_for_condition 50 1000 {
            [status $replica master_link_status] eq {up} &&
            [status $sub_replica master_link_status] eq {up}
        } else {
            fail "Replicas didn't sync after master restart"
        }

        # Make sure master restore replication info correctly
        assert {[status $master master_replid] != $replid}
        assert {[status $master master_repl_offset] == $offset}
        assert {[status $master master_replid2] eq $replid}
        assert {[status $master second_repl_offset] == [expr $offset+1]}

        # Make sure master set replication backlog correctly
        assert {[status $master repl_backlog_active] == 1}
        assert {[status $master repl_backlog_first_byte_offset] == [expr $offset+1]}
        assert {[status $master repl_backlog_histlen] == 0}

        # Partial resync after Master restart
        assert {[status $master sync_partial_ok] == 1}
        assert {[status $replica sync_partial_ok] == 1}
    }

    test "PSYNC2: Partial resync after Master restart using RDB aux fields with expire" {
        $master debug set-active-expire 0
        for {set j 0} {$j < 1024} {incr j} {
            $master select [expr $j%16]
            $master set $j somevalue px 10
        }

        after 20

        # Wait until master has received ACK from replica. If the master thinks
        # that any replica is lagging when it shuts down, master would send
        # GETACK to the replicas, affecting the replication offset.
        set offset [status $master master_repl_offset]
        wait_for_condition 500 100 {
            [string match "*slave0:*,offset=$offset,*" [$master info replication]] &&
            $offset == [status $replica master_repl_offset] &&
            $offset == [status $sub_replica master_repl_offset]
        } else {
            show_cluster_status
            fail "Replicas and master offsets were unable to match *exactly*."
        }

        set offset [status $master master_repl_offset]
        $replica config resetstat

        catch {
            # Unlike the test above, here we use SIGTERM, which behaves
            # differently compared to SHUTDOWN NOW if there are lagging
            # replicas. This is just to increase coverage and let each test use
            # a different shutdown approach. In this case there are no lagging
            # replicas though.
            restart_server 0 true false
            set master [srv 0 client]
        }
        wait_for_condition 50 1000 {
            [status $replica master_link_status] eq {up} &&
            [status $sub_replica master_link_status] eq {up}
        } else {
            fail "Replicas didn't sync after master restart"
        }

        set expired_offset [status $master repl_backlog_histlen]
        # Stale keys expired and master_repl_offset grows correctly
        assert {[status $master rdb_last_load_keys_expired] == 1024}
        assert {[status $master master_repl_offset] == [expr $offset+$expired_offset]}

        # Partial resync after Master restart
        assert {[status $master sync_partial_ok] == 1}
        assert {[status $replica sync_partial_ok] == 1}

        set digest [$master debug digest]
        assert {$digest eq [$replica debug digest]}
        assert {$digest eq [$sub_replica debug digest]}
    }

    test "PSYNC2: Full resync after Master restart when too many key expired" {
        $master config set repl-backlog-size 16384
        $master config rewrite

        $master debug set-active-expire 0
        # Make sure replication backlog is full and will be trimmed.
        for {set j 0} {$j < 2048} {incr j} {
            $master select [expr $j%16]
            $master set $j somevalue px 10
        }

        after 20

        wait_for_condition 500 100 {
            [status $master master_repl_offset] == [status $replica master_repl_offset] &&
            [status $master master_repl_offset] == [status $sub_replica master_repl_offset]
        } else {
            fail "Replicas and master offsets were unable to match *exactly*."
        }

        $replica config resetstat

        catch {
            # Unlike the test above, here we use SIGTERM. This is just to
            # increase coverage and let each test use a different shutdown
            # approach.
            restart_server 0 true false
            set master [srv 0 client]
        }
        wait_for_condition 50 1000 {
            [status $replica master_link_status] eq {up} &&
            [status $sub_replica master_link_status] eq {up}
        } else {
            fail "Replicas didn't sync after master restart"
        }

        # Replication backlog is full
        assert {[status $master repl_backlog_first_byte_offset] > [status $master second_repl_offset]}
        assert {[status $master sync_partial_ok] == 0}
        assert {[status $master sync_full] == 1}
        assert {[status $master rdb_last_load_keys_expired] == 2048}
        assert {[status $replica sync_full] == 1}

        set digest [$master debug digest]
        assert {$digest eq [$replica debug digest]}
        assert {$digest eq [$sub_replica debug digest]}
    }
}}}
-												PSYNC2: make partial sync possible after master reboot (#8015)

The main idea is how to allow a master to load replication info from RDB file when rebooting, if master can load replication info it means that replicas may have the chance to psync with master, it can save much traffic.

The key point is we need guarantee safety and consistency, so there
are two differences between master and replica:

1. master would load the replication info as secondary ID and
   offset, in case other masters have the same replid.
2. when master loading RDB, it would propagate expired keys as DEL
   command to replication backlog, then replica can receive these
   commands to delete stale keys.
   p.s. the expired keys when RDB loading is useful for users, so
   we show it as `rdb_last_load_keys_expired` and `rdb_last_load_keys_loaded` in info persistence.

Moreover, after load replication info, master should update
`no_replica_time` in case loading RDB cost too long time.
											
										
										
											2021-09-13 03:39:11 -04:00
+								start_server {tags {"psync2 external:skip"}} {
 								start_server {} {
 								start_server {} {
 								    set master [srv 0 client]
 								    set master_host [srv 0 host]
 								    set master_port [srv 0 port]
 								    set replica [srv -1 client]
 								    set replica_host [srv -1 host]
 								    set replica_port [srv -1 port]
 								    set sub_replica [srv -2 client]
-												Tests: Do not save an RDB by default and add a SIGTERM default AOFRW test (#12064)

In order to speed up tests, avoid saving an RDB (mostly notable on shutdown),
except for tests that explicitly test the RDB mechanism

In addition, use `shutdown-on-sigterm force` to prevetn shutdown from failing
in case the server is in the middle of the initial AOFRW

Also a a test that checks that the `shutdown-on-sigterm default` is to refuse
shutdown if there's an initial AOFRW

Co-authored-by: Guy Benoish <guy.benoish@redislabs.com>
											
										
										
											2023-04-18 09:14:26 -04:00
+								    # Make sure the server saves an RDB on shutdown
 								    $master config set save "3600 1"
-												Fix race in PSYNC2 partial resync test (#11653)

This test sometimes fails:
```
*** [err]: PSYNC2: Partial resync after Master restart using RDB aux fields with expire in tests/integration/psync2-master-restart.tcl
Expected [status ::redis::redisHandle24 sync_partial_ok] == 1 (context: type eval line 49 cmd {assert {[status $replica sync_partial_ok] == 1}} proc ::test)
```

This is because the default repl-timeout value is 10s, sometimes the test
got timeout, then it will do a reconnect, it will incr the sync_partial_ok
counter, and then cause the test to fail.In this fix, we set the repl-timeout
to a very large number to make sure we won't get the timeout.
											
										
										
											2022-12-22 07:23:14 -05:00
+								    # Because we will test partial resync later, we don’t want a timeout to cause
 								    # the master-replica disconnect, then the extra reconnections will break the
 								    # sync_partial_ok stat test
 								    $master config set repl-timeout 3600
 								    $replica config set repl-timeout 3600
 								    $sub_replica config set repl-timeout 3600
-												PSYNC2: make partial sync possible after master reboot (#8015)

The main idea is how to allow a master to load replication info from RDB file when rebooting, if master can load replication info it means that replicas may have the chance to psync with master, it can save much traffic.

The key point is we need guarantee safety and consistency, so there
are two differences between master and replica:

1. master would load the replication info as secondary ID and
   offset, in case other masters have the same replid.
2. when master loading RDB, it would propagate expired keys as DEL
   command to replication backlog, then replica can receive these
   commands to delete stale keys.
   p.s. the expired keys when RDB loading is useful for users, so
   we show it as `rdb_last_load_keys_expired` and `rdb_last_load_keys_loaded` in info persistence.

Moreover, after load replication info, master should update
`no_replica_time` in case loading RDB cost too long time.
											
										
										
											2021-09-13 03:39:11 -04:00
+								    # Build replication chain
 								    $replica replicaof $master_host $master_port
 								    $sub_replica replicaof $replica_host $replica_port
 								    wait_for_condition 50 100 {
 								        [status $replica master_link_status] eq {up} &&
 								        [status $sub_replica master_link_status] eq {up}
 								    } else {
 								        fail "Replication not started."
 								    }
 								    # Avoid PINGs
 								    $master config set repl-ping-replica-period 3600
 								    $master config rewrite
 								    # Generate some data
 								    createComplexDataset $master 1000
 								    test "PSYNC2: Partial resync after Master restart using RDB aux fields" {
 								        wait_for_condition 500 100 {
 								            [status $master master_repl_offset] == [status $replica master_repl_offset] &&
 								            [status $master master_repl_offset] == [status $sub_replica master_repl_offset]
 								        } else {
 								            fail "Replicas and master offsets were unable to match *exactly*."
 								        }
 								        set replid [status $master master_replid]
 								        set offset [status $master master_repl_offset]
 								        $replica config resetstat
 								        catch {
-												Wait for replicas when shutting down (#9872)

To avoid data loss, this commit adds a grace period for lagging replicas to
catch up the replication offset.

Done:

* Wait for replicas when shutdown is triggered by SIGTERM and SIGINT.

* Wait for replicas when shutdown is triggered by the SHUTDOWN command. A new
  blocked client type BLOCKED_SHUTDOWN is introduced, allowing multiple clients
  to call SHUTDOWN in parallel.
  Note that they don't expect a response unless an error happens and shutdown is aborted.

* Log warning for each replica lagging behind when finishing shutdown.

* CLIENT_PAUSE_WRITE while waiting for replicas.

* Configurable grace period 'shutdown-timeout' in seconds (default 10).

* New flags for the SHUTDOWN command:

    - NOW disables the grace period for lagging replicas.

    - FORCE ignores errors writing the RDB or AOF files which would normally
      prevent a shutdown.

    - ABORT cancels ongoing shutdown. Can't be combined with other flags.

* New field in the output of the INFO command: 'shutdown_in_milliseconds'. The
  value is the remaining maximum time to wait for lagging replicas before
  finishing the shutdown. This field is present in the Server section **only**
  during shutdown.

Not directly related:

* When shutting down, if there is an AOF saving child, it is killed **even** if AOF
  is disabled. This can happen if BGREWRITEAOF is used when AOF is off.

* Client pause now has end time and type (WRITE or ALL) per purpose. The
  different pause purposes are *CLIENT PAUSE command*, *failover* and
  *shutdown*. If clients are unpaused for one purpose, it doesn't affect client
  pause for other purposes. For example, the CLIENT UNPAUSE command doesn't
  affect client pause initiated by the failover or shutdown procedures. A completed
  failover or a failed shutdown doesn't unpause clients paused by the CLIENT
  PAUSE command.

Notes:

* DEBUG RESTART doesn't wait for replicas.

* We already have a warning logged when a replica disconnects. This means that
  if any replica connection is lost during the shutdown, it is either logged as
  disconnected or as lagging at the time of exit.

Co-authored-by: Oran Agra <oran@redislabs.com>
											
										
										
											2022-01-02 02:50:15 -05:00
+								            # SHUTDOWN NOW ensures master doesn't send GETACK to replicas before
 								            # shutting down which would affect the replication offset.
 								            restart_server 0 true false true now
-												PSYNC2: make partial sync possible after master reboot (#8015)

The main idea is how to allow a master to load replication info from RDB file when rebooting, if master can load replication info it means that replicas may have the chance to psync with master, it can save much traffic.

The key point is we need guarantee safety and consistency, so there
are two differences between master and replica:

1. master would load the replication info as secondary ID and
   offset, in case other masters have the same replid.
2. when master loading RDB, it would propagate expired keys as DEL
   command to replication backlog, then replica can receive these
   commands to delete stale keys.
   p.s. the expired keys when RDB loading is useful for users, so
   we show it as `rdb_last_load_keys_expired` and `rdb_last_load_keys_loaded` in info persistence.

Moreover, after load replication info, master should update
`no_replica_time` in case loading RDB cost too long time.
											
										
										
											2021-09-13 03:39:11 -04:00
+								            set master [srv 0 client]
 								        }
 								        wait_for_condition 50 1000 {
 								            [status $replica master_link_status] eq {up} &&
 								            [status $sub_replica master_link_status] eq {up}
 								        } else {
 								            fail "Replicas didn't sync after master restart"
 								        }
 								        # Make sure master restore replication info correctly
 								        assert {[status $master master_replid] != $replid}
 								        assert {[status $master master_repl_offset] == $offset}
 								        assert {[status $master master_replid2] eq $replid}
 								        assert {[status $master second_repl_offset] == [expr $offset+1]}
 								        # Make sure master set replication backlog correctly
 								        assert {[status $master repl_backlog_active] == 1}
 								        assert {[status $master repl_backlog_first_byte_offset] == [expr $offset+1]}
 								        assert {[status $master repl_backlog_histlen] == 0}
 								        # Partial resync after Master restart
 								        assert {[status $master sync_partial_ok] == 1}
 								        assert {[status $replica sync_partial_ok] == 1}
 								    }
 								    test "PSYNC2: Partial resync after Master restart using RDB aux fields with expire" {
 								        $master debug set-active-expire 0
 								        for {set j 0} {$j < 1024} {incr j} {
 								            $master select [expr $j%16]
 								            $master set $j somevalue px 10
 								        }
 								        after 20
-												Wait for replicas when shutting down (#9872)

To avoid data loss, this commit adds a grace period for lagging replicas to
catch up the replication offset.

Done:

* Wait for replicas when shutdown is triggered by SIGTERM and SIGINT.

* Wait for replicas when shutdown is triggered by the SHUTDOWN command. A new
  blocked client type BLOCKED_SHUTDOWN is introduced, allowing multiple clients
  to call SHUTDOWN in parallel.
  Note that they don't expect a response unless an error happens and shutdown is aborted.

* Log warning for each replica lagging behind when finishing shutdown.

* CLIENT_PAUSE_WRITE while waiting for replicas.

* Configurable grace period 'shutdown-timeout' in seconds (default 10).

* New flags for the SHUTDOWN command:

    - NOW disables the grace period for lagging replicas.

    - FORCE ignores errors writing the RDB or AOF files which would normally
      prevent a shutdown.

    - ABORT cancels ongoing shutdown. Can't be combined with other flags.

* New field in the output of the INFO command: 'shutdown_in_milliseconds'. The
  value is the remaining maximum time to wait for lagging replicas before
  finishing the shutdown. This field is present in the Server section **only**
  during shutdown.

Not directly related:

* When shutting down, if there is an AOF saving child, it is killed **even** if AOF
  is disabled. This can happen if BGREWRITEAOF is used when AOF is off.

* Client pause now has end time and type (WRITE or ALL) per purpose. The
  different pause purposes are *CLIENT PAUSE command*, *failover* and
  *shutdown*. If clients are unpaused for one purpose, it doesn't affect client
  pause for other purposes. For example, the CLIENT UNPAUSE command doesn't
  affect client pause initiated by the failover or shutdown procedures. A completed
  failover or a failed shutdown doesn't unpause clients paused by the CLIENT
  PAUSE command.

Notes:

* DEBUG RESTART doesn't wait for replicas.

* We already have a warning logged when a replica disconnects. This means that
  if any replica connection is lost during the shutdown, it is either logged as
  disconnected or as lagging at the time of exit.

Co-authored-by: Oran Agra <oran@redislabs.com>
											
										
										
											2022-01-02 02:50:15 -05:00
+								        # Wait until master has received ACK from replica. If the master thinks
 								        # that any replica is lagging when it shuts down, master would send
 								        # GETACK to the replicas, affecting the replication offset.
 								        set offset [status $master master_repl_offset]
-												PSYNC2: make partial sync possible after master reboot (#8015)

The main idea is how to allow a master to load replication info from RDB file when rebooting, if master can load replication info it means that replicas may have the chance to psync with master, it can save much traffic.

The key point is we need guarantee safety and consistency, so there
are two differences between master and replica:

1. master would load the replication info as secondary ID and
   offset, in case other masters have the same replid.
2. when master loading RDB, it would propagate expired keys as DEL
   command to replication backlog, then replica can receive these
   commands to delete stale keys.
   p.s. the expired keys when RDB loading is useful for users, so
   we show it as `rdb_last_load_keys_expired` and `rdb_last_load_keys_loaded` in info persistence.

Moreover, after load replication info, master should update
`no_replica_time` in case loading RDB cost too long time.
											
										
										
											2021-09-13 03:39:11 -04:00
+								        wait_for_condition 500 100 {
-												Wait for replicas when shutting down (#9872)

To avoid data loss, this commit adds a grace period for lagging replicas to
catch up the replication offset.

Done:

* Wait for replicas when shutdown is triggered by SIGTERM and SIGINT.

* Wait for replicas when shutdown is triggered by the SHUTDOWN command. A new
  blocked client type BLOCKED_SHUTDOWN is introduced, allowing multiple clients
  to call SHUTDOWN in parallel.
  Note that they don't expect a response unless an error happens and shutdown is aborted.

* Log warning for each replica lagging behind when finishing shutdown.

* CLIENT_PAUSE_WRITE while waiting for replicas.

* Configurable grace period 'shutdown-timeout' in seconds (default 10).

* New flags for the SHUTDOWN command:

    - NOW disables the grace period for lagging replicas.

    - FORCE ignores errors writing the RDB or AOF files which would normally
      prevent a shutdown.

    - ABORT cancels ongoing shutdown. Can't be combined with other flags.

* New field in the output of the INFO command: 'shutdown_in_milliseconds'. The
  value is the remaining maximum time to wait for lagging replicas before
  finishing the shutdown. This field is present in the Server section **only**
  during shutdown.

Not directly related:

* When shutting down, if there is an AOF saving child, it is killed **even** if AOF
  is disabled. This can happen if BGREWRITEAOF is used when AOF is off.

* Client pause now has end time and type (WRITE or ALL) per purpose. The
  different pause purposes are *CLIENT PAUSE command*, *failover* and
  *shutdown*. If clients are unpaused for one purpose, it doesn't affect client
  pause for other purposes. For example, the CLIENT UNPAUSE command doesn't
  affect client pause initiated by the failover or shutdown procedures. A completed
  failover or a failed shutdown doesn't unpause clients paused by the CLIENT
  PAUSE command.

Notes:

* DEBUG RESTART doesn't wait for replicas.

* We already have a warning logged when a replica disconnects. This means that
  if any replica connection is lost during the shutdown, it is either logged as
  disconnected or as lagging at the time of exit.

Co-authored-by: Oran Agra <oran@redislabs.com>
											
										
										
											2022-01-02 02:50:15 -05:00
+								            [string match "*slave0:*,offset=$offset,*" [$master info replication]] &&
 								            $offset == [status $replica master_repl_offset] &&
 								            $offset == [status $sub_replica master_repl_offset]
-												PSYNC2: make partial sync possible after master reboot (#8015)

The main idea is how to allow a master to load replication info from RDB file when rebooting, if master can load replication info it means that replicas may have the chance to psync with master, it can save much traffic.

The key point is we need guarantee safety and consistency, so there
are two differences between master and replica:

1. master would load the replication info as secondary ID and
   offset, in case other masters have the same replid.
2. when master loading RDB, it would propagate expired keys as DEL
   command to replication backlog, then replica can receive these
   commands to delete stale keys.
   p.s. the expired keys when RDB loading is useful for users, so
   we show it as `rdb_last_load_keys_expired` and `rdb_last_load_keys_loaded` in info persistence.

Moreover, after load replication info, master should update
`no_replica_time` in case loading RDB cost too long time.
											
										
										
											2021-09-13 03:39:11 -04:00
+								        } else {
 								            show_cluster_status
 								            fail "Replicas and master offsets were unable to match *exactly*."
 								        }
 								        set offset [status $master master_repl_offset]
 								        $replica config resetstat
 								        catch {
-												Wait for replicas when shutting down (#9872)

To avoid data loss, this commit adds a grace period for lagging replicas to
catch up the replication offset.

Done:

* Wait for replicas when shutdown is triggered by SIGTERM and SIGINT.

* Wait for replicas when shutdown is triggered by the SHUTDOWN command. A new
  blocked client type BLOCKED_SHUTDOWN is introduced, allowing multiple clients
  to call SHUTDOWN in parallel.
  Note that they don't expect a response unless an error happens and shutdown is aborted.

* Log warning for each replica lagging behind when finishing shutdown.

* CLIENT_PAUSE_WRITE while waiting for replicas.

* Configurable grace period 'shutdown-timeout' in seconds (default 10).

* New flags for the SHUTDOWN command:

    - NOW disables the grace period for lagging replicas.

    - FORCE ignores errors writing the RDB or AOF files which would normally
      prevent a shutdown.

    - ABORT cancels ongoing shutdown. Can't be combined with other flags.

* New field in the output of the INFO command: 'shutdown_in_milliseconds'. The
  value is the remaining maximum time to wait for lagging replicas before
  finishing the shutdown. This field is present in the Server section **only**
  during shutdown.

Not directly related:

* When shutting down, if there is an AOF saving child, it is killed **even** if AOF
  is disabled. This can happen if BGREWRITEAOF is used when AOF is off.

* Client pause now has end time and type (WRITE or ALL) per purpose. The
  different pause purposes are *CLIENT PAUSE command*, *failover* and
  *shutdown*. If clients are unpaused for one purpose, it doesn't affect client
  pause for other purposes. For example, the CLIENT UNPAUSE command doesn't
  affect client pause initiated by the failover or shutdown procedures. A completed
  failover or a failed shutdown doesn't unpause clients paused by the CLIENT
  PAUSE command.

Notes:

* DEBUG RESTART doesn't wait for replicas.

* We already have a warning logged when a replica disconnects. This means that
  if any replica connection is lost during the shutdown, it is either logged as
  disconnected or as lagging at the time of exit.

Co-authored-by: Oran Agra <oran@redislabs.com>
											
										
										
											2022-01-02 02:50:15 -05:00
+								            # Unlike the test above, here we use SIGTERM, which behaves
 								            # differently compared to SHUTDOWN NOW if there are lagging
 								            # replicas. This is just to increase coverage and let each test use
 								            # a different shutdown approach. In this case there are no lagging
 								            # replicas though.
-												PSYNC2: make partial sync possible after master reboot (#8015)

The main idea is how to allow a master to load replication info from RDB file when rebooting, if master can load replication info it means that replicas may have the chance to psync with master, it can save much traffic.

The key point is we need guarantee safety and consistency, so there
are two differences between master and replica:

1. master would load the replication info as secondary ID and
   offset, in case other masters have the same replid.
2. when master loading RDB, it would propagate expired keys as DEL
   command to replication backlog, then replica can receive these
   commands to delete stale keys.
   p.s. the expired keys when RDB loading is useful for users, so
   we show it as `rdb_last_load_keys_expired` and `rdb_last_load_keys_loaded` in info persistence.

Moreover, after load replication info, master should update
`no_replica_time` in case loading RDB cost too long time.
											
										
										
											2021-09-13 03:39:11 -04:00
+								            restart_server 0 true false
 								            set master [srv 0 client]
 								        }
 								        wait_for_condition 50 1000 {
 								            [status $replica master_link_status] eq {up} &&
 								            [status $sub_replica master_link_status] eq {up}
 								        } else {
 								            fail "Replicas didn't sync after master restart"
 								        }
 								        set expired_offset [status $master repl_backlog_histlen]
 								        # Stale keys expired and master_repl_offset grows correctly
 								        assert {[status $master rdb_last_load_keys_expired] == 1024}
 								        assert {[status $master master_repl_offset] == [expr $offset+$expired_offset]}
 								        # Partial resync after Master restart
 								        assert {[status $master sync_partial_ok] == 1}
 								        assert {[status $replica sync_partial_ok] == 1}
 								        set digest [$master debug digest]
 								        assert {$digest eq [$replica debug digest]}
 								        assert {$digest eq [$sub_replica debug digest]}
 								    }
 								    test "PSYNC2: Full resync after Master restart when too many key expired" {
 								        $master config set repl-backlog-size 16384
 								        $master config rewrite
 								        $master debug set-active-expire 0
-												Replication backlog and replicas use one global shared replication buffer (#9166)

## Background
For redis master, one replica uses one copy of replication buffer, that is a big waste of memory,
more replicas more waste, and allocate/free memory for every reply list also cost much.
If we set client-output-buffer-limit small and write traffic is heavy, master may disconnect with
replicas and can't finish synchronization with replica. If we set  client-output-buffer-limit big,
master may be OOM when there are many replicas that separately keep much memory.
Because replication buffers of different replica client are the same, one simple idea is that
all replicas only use one replication buffer, that will effectively save memory.

Since replication backlog content is the same as replicas' output buffer, now we
can discard replication backlog memory and use global shared replication buffer
to implement replication backlog mechanism.

## Implementation
I create one global "replication buffer" which contains content of replication stream.
The structure of "replication buffer" is similar to the reply list that exists in every client.
But the node of list is `replBufBlock`, which has `id, repl_offset, refcount` fields.
```c
/* Replication buffer blocks is the list of replBufBlock.
 *
 * +--------------+       +--------------+       +--------------+
 * | refcount = 1 |  ...  | refcount = 0 |  ...  | refcount = 2 |
 * +--------------+       +--------------+       +--------------+
 *      |                                            /       \
 *      |                                           /         \
 *      |                                          /           \
 *  Repl Backlog                               Replia_A      Replia_B
 * 
 * Each replica or replication backlog increments only the refcount of the
 * 'ref_repl_buf_node' which it points to. So when replica walks to the next
 * node, it should first increase the next node's refcount, and when we trim
 * the replication buffer nodes, we remove node always from the head node which
 * refcount is 0. If the refcount of the head node is not 0, we must stop
 * trimming and never iterate the next node. */

/* Similar with 'clientReplyBlock', it is used for shared buffers between
 * all replica clients and replication backlog. */
typedef struct replBufBlock {
    int refcount;           /* Number of replicas or repl backlog using. */
    long long id;           /* The unique incremental number. */
    long long repl_offset;  /* Start replication offset of the block. */
    size_t size, used;
    char buf[];
} replBufBlock;
```
So now when we feed replication stream into replication backlog and all replicas, we only need
to feed stream into replication buffer `feedReplicationBuffer`. In this function, we set some fields of
replication backlog and replicas to references of the global replication buffer blocks. And we also
need to check replicas' output buffer limit to free if exceeding `client-output-buffer-limit`, and trim
replication backlog if exceeding `repl-backlog-size`.

When sending reply to replicas, we also need to iterate replication buffer blocks and send its
content, when totally sending one block for replica, we decrease current node count and
increase the next current node count, and then free the block which reference is 0 from the
head of replication buffer blocks.

Since now we use linked list to manage replication backlog, it may cost much time for iterating
all linked list nodes to find corresponding replication buffer node. So we create a rax tree to
store some nodes  for index, but to avoid rax tree occupying too much memory, i record
one per 64 nodes for index.

Currently, to make partial resynchronization as possible as much, we always let replication
backlog as the last reference of replication buffer blocks, backlog size may exceeds our setting
if slow replicas that reference vast replication buffer blocks, and this method doesn't increase
memory usage since they share replication buffer. To avoid freezing server for freeing unreferenced
replication buffer blocks when we need to trim backlog for exceeding backlog size setting,
we trim backlog incrementally (free 64 blocks per call now), and make it faster in
`beforeSleep` (free 640 blocks).

### Other changes
- `mem_total_replication_buffers`: we add this field in INFO command, it means the total
  memory of replication buffers used.
- `mem_clients_slaves`:  now even replica is slow to replicate, and its output buffer memory
  is not 0, but it still may be 0, since replication backlog and replicas share one global replication
  buffer, only if replication buffer memory is more than the repl backlog setting size, we consider
  the excess as replicas' memory. Otherwise, we think replication buffer memory is the consumption
  of repl backlog.
- Key eviction
  Since all replicas and replication backlog share global replication buffer, we think only the
  part of exceeding backlog size the extra separate consumption of replicas.
  Because we trim backlog incrementally in the background, backlog size may exceeds our
  setting if slow replicas that reference vast replication buffer blocks disconnect.
  To avoid massive eviction loop, we don't count the delayed freed replication backlog into
  used memory even if there are no replicas, i.e. we also regard this memory as replicas's memory.
- `client-output-buffer-limit` check for replica clients
  It doesn't make sense to set the replica clients output buffer limit lower than the repl-backlog-size
  config (partial sync will succeed and then replica will get disconnected). Such a configuration is
  ignored (the size of repl-backlog-size will be used). This doesn't have memory consumption
  implications since the replica client will share the backlog buffers memory.
- Drop replication backlog after loading data if needed
  We always create replication backlog if server is a master, we need it because we put DELs in
  it when loading expired keys in RDB, but if RDB doesn't have replication info or there is no rdb,
  it is not possible to support partial resynchronization, to avoid extra memory of replication backlog,
  we drop it.
- Multi IO threads
 Since all replicas and replication backlog use global replication buffer,  if I/O threads are enabled,
  to guarantee data accessing thread safe, we must let main thread handle sending the output buffer
  to all replicas. But before, other IO threads could handle sending output buffer of all replicas.

## Other optimizations
This solution resolve some other problem:
- When replicas disconnect with master since of out of output buffer limit, releasing the output
  buffer of replicas may freeze server if we set big `client-output-buffer-limit` for replicas, but now,
  it doesn't cause freezing.
- This implementation may mitigate reply list copy cost time(also freezes server) when one replication
  has huge reply buffer and another replica can copy buffer for full synchronization. now, we just copy
  reference info, it is very light.
- If we set replication backlog size big, it also may cost much time to copy replication backlog into
  replica's output buffer. But this commit eliminates this problem.
- Resizing replication backlog size doesn't empty current replication backlog content.

											
										
										
											2021-10-25 02:24:31 -04:00
+								        # Make sure replication backlog is full and will be trimmed.
 								        for {set j 0} {$j < 2048} {incr j} {
-												PSYNC2: make partial sync possible after master reboot (#8015)

The main idea is how to allow a master to load replication info from RDB file when rebooting, if master can load replication info it means that replicas may have the chance to psync with master, it can save much traffic.

The key point is we need guarantee safety and consistency, so there
are two differences between master and replica:

1. master would load the replication info as secondary ID and
   offset, in case other masters have the same replid.
2. when master loading RDB, it would propagate expired keys as DEL
   command to replication backlog, then replica can receive these
   commands to delete stale keys.
   p.s. the expired keys when RDB loading is useful for users, so
   we show it as `rdb_last_load_keys_expired` and `rdb_last_load_keys_loaded` in info persistence.

Moreover, after load replication info, master should update
`no_replica_time` in case loading RDB cost too long time.
											
										
										
											2021-09-13 03:39:11 -04:00
+								            $master select [expr $j%16]
 								            $master set $j somevalue px 10
 								        }
 								        after 20
 								        wait_for_condition 500 100 {
 								            [status $master master_repl_offset] == [status $replica master_repl_offset] &&
 								            [status $master master_repl_offset] == [status $sub_replica master_repl_offset]
 								        } else {
 								            fail "Replicas and master offsets were unable to match *exactly*."
 								        }
 								        $replica config resetstat
 								        catch {
-												Wait for replicas when shutting down (#9872)

To avoid data loss, this commit adds a grace period for lagging replicas to
catch up the replication offset.

Done:

* Wait for replicas when shutdown is triggered by SIGTERM and SIGINT.

* Wait for replicas when shutdown is triggered by the SHUTDOWN command. A new
  blocked client type BLOCKED_SHUTDOWN is introduced, allowing multiple clients
  to call SHUTDOWN in parallel.
  Note that they don't expect a response unless an error happens and shutdown is aborted.

* Log warning for each replica lagging behind when finishing shutdown.

* CLIENT_PAUSE_WRITE while waiting for replicas.

* Configurable grace period 'shutdown-timeout' in seconds (default 10).

* New flags for the SHUTDOWN command:

    - NOW disables the grace period for lagging replicas.

    - FORCE ignores errors writing the RDB or AOF files which would normally
      prevent a shutdown.

    - ABORT cancels ongoing shutdown. Can't be combined with other flags.

* New field in the output of the INFO command: 'shutdown_in_milliseconds'. The
  value is the remaining maximum time to wait for lagging replicas before
  finishing the shutdown. This field is present in the Server section **only**
  during shutdown.

Not directly related:

* When shutting down, if there is an AOF saving child, it is killed **even** if AOF
  is disabled. This can happen if BGREWRITEAOF is used when AOF is off.

* Client pause now has end time and type (WRITE or ALL) per purpose. The
  different pause purposes are *CLIENT PAUSE command*, *failover* and
  *shutdown*. If clients are unpaused for one purpose, it doesn't affect client
  pause for other purposes. For example, the CLIENT UNPAUSE command doesn't
  affect client pause initiated by the failover or shutdown procedures. A completed
  failover or a failed shutdown doesn't unpause clients paused by the CLIENT
  PAUSE command.

Notes:

* DEBUG RESTART doesn't wait for replicas.

* We already have a warning logged when a replica disconnects. This means that
  if any replica connection is lost during the shutdown, it is either logged as
  disconnected or as lagging at the time of exit.

Co-authored-by: Oran Agra <oran@redislabs.com>
											
										
										
											2022-01-02 02:50:15 -05:00
+								            # Unlike the test above, here we use SIGTERM. This is just to
 								            # increase coverage and let each test use a different shutdown
 								            # approach.
-												PSYNC2: make partial sync possible after master reboot (#8015)

The main idea is how to allow a master to load replication info from RDB file when rebooting, if master can load replication info it means that replicas may have the chance to psync with master, it can save much traffic.

The key point is we need guarantee safety and consistency, so there
are two differences between master and replica:

1. master would load the replication info as secondary ID and
   offset, in case other masters have the same replid.
2. when master loading RDB, it would propagate expired keys as DEL
   command to replication backlog, then replica can receive these
   commands to delete stale keys.
   p.s. the expired keys when RDB loading is useful for users, so
   we show it as `rdb_last_load_keys_expired` and `rdb_last_load_keys_loaded` in info persistence.

Moreover, after load replication info, master should update
`no_replica_time` in case loading RDB cost too long time.
											
										
										
											2021-09-13 03:39:11 -04:00
+								            restart_server 0 true false
 								            set master [srv 0 client]
 								        }
 								        wait_for_condition 50 1000 {
 								            [status $replica master_link_status] eq {up} &&
 								            [status $sub_replica master_link_status] eq {up}
 								        } else {
 								            fail "Replicas didn't sync after master restart"
 								        }
 								        # Replication backlog is full
 								        assert {[status $master repl_backlog_first_byte_offset] > [status $master second_repl_offset]}
 								        assert {[status $master sync_partial_ok] == 0}
 								        assert {[status $master sync_full] == 1}
-												Replication backlog and replicas use one global shared replication buffer (#9166)

## Background
For redis master, one replica uses one copy of replication buffer, that is a big waste of memory,
more replicas more waste, and allocate/free memory for every reply list also cost much.
If we set client-output-buffer-limit small and write traffic is heavy, master may disconnect with
replicas and can't finish synchronization with replica. If we set  client-output-buffer-limit big,
master may be OOM when there are many replicas that separately keep much memory.
Because replication buffers of different replica client are the same, one simple idea is that
all replicas only use one replication buffer, that will effectively save memory.

Since replication backlog content is the same as replicas' output buffer, now we
can discard replication backlog memory and use global shared replication buffer
to implement replication backlog mechanism.

## Implementation
I create one global "replication buffer" which contains content of replication stream.
The structure of "replication buffer" is similar to the reply list that exists in every client.
But the node of list is `replBufBlock`, which has `id, repl_offset, refcount` fields.
```c
/* Replication buffer blocks is the list of replBufBlock.
 *
 * +--------------+       +--------------+       +--------------+
 * | refcount = 1 |  ...  | refcount = 0 |  ...  | refcount = 2 |
 * +--------------+       +--------------+       +--------------+
 *      |                                            /       \
 *      |                                           /         \
 *      |                                          /           \
 *  Repl Backlog                               Replia_A      Replia_B
 * 
 * Each replica or replication backlog increments only the refcount of the
 * 'ref_repl_buf_node' which it points to. So when replica walks to the next
 * node, it should first increase the next node's refcount, and when we trim
 * the replication buffer nodes, we remove node always from the head node which
 * refcount is 0. If the refcount of the head node is not 0, we must stop
 * trimming and never iterate the next node. */

/* Similar with 'clientReplyBlock', it is used for shared buffers between
 * all replica clients and replication backlog. */
typedef struct replBufBlock {
    int refcount;           /* Number of replicas or repl backlog using. */
    long long id;           /* The unique incremental number. */
    long long repl_offset;  /* Start replication offset of the block. */
    size_t size, used;
    char buf[];
} replBufBlock;
```
So now when we feed replication stream into replication backlog and all replicas, we only need
to feed stream into replication buffer `feedReplicationBuffer`. In this function, we set some fields of
replication backlog and replicas to references of the global replication buffer blocks. And we also
need to check replicas' output buffer limit to free if exceeding `client-output-buffer-limit`, and trim
replication backlog if exceeding `repl-backlog-size`.

When sending reply to replicas, we also need to iterate replication buffer blocks and send its
content, when totally sending one block for replica, we decrease current node count and
increase the next current node count, and then free the block which reference is 0 from the
head of replication buffer blocks.

Since now we use linked list to manage replication backlog, it may cost much time for iterating
all linked list nodes to find corresponding replication buffer node. So we create a rax tree to
store some nodes  for index, but to avoid rax tree occupying too much memory, i record
one per 64 nodes for index.

Currently, to make partial resynchronization as possible as much, we always let replication
backlog as the last reference of replication buffer blocks, backlog size may exceeds our setting
if slow replicas that reference vast replication buffer blocks, and this method doesn't increase
memory usage since they share replication buffer. To avoid freezing server for freeing unreferenced
replication buffer blocks when we need to trim backlog for exceeding backlog size setting,
we trim backlog incrementally (free 64 blocks per call now), and make it faster in
`beforeSleep` (free 640 blocks).

### Other changes
- `mem_total_replication_buffers`: we add this field in INFO command, it means the total
  memory of replication buffers used.
- `mem_clients_slaves`:  now even replica is slow to replicate, and its output buffer memory
  is not 0, but it still may be 0, since replication backlog and replicas share one global replication
  buffer, only if replication buffer memory is more than the repl backlog setting size, we consider
  the excess as replicas' memory. Otherwise, we think replication buffer memory is the consumption
  of repl backlog.
- Key eviction
  Since all replicas and replication backlog share global replication buffer, we think only the
  part of exceeding backlog size the extra separate consumption of replicas.
  Because we trim backlog incrementally in the background, backlog size may exceeds our
  setting if slow replicas that reference vast replication buffer blocks disconnect.
  To avoid massive eviction loop, we don't count the delayed freed replication backlog into
  used memory even if there are no replicas, i.e. we also regard this memory as replicas's memory.
- `client-output-buffer-limit` check for replica clients
  It doesn't make sense to set the replica clients output buffer limit lower than the repl-backlog-size
  config (partial sync will succeed and then replica will get disconnected). Such a configuration is
  ignored (the size of repl-backlog-size will be used). This doesn't have memory consumption
  implications since the replica client will share the backlog buffers memory.
- Drop replication backlog after loading data if needed
  We always create replication backlog if server is a master, we need it because we put DELs in
  it when loading expired keys in RDB, but if RDB doesn't have replication info or there is no rdb,
  it is not possible to support partial resynchronization, to avoid extra memory of replication backlog,
  we drop it.
- Multi IO threads
 Since all replicas and replication backlog use global replication buffer,  if I/O threads are enabled,
  to guarantee data accessing thread safe, we must let main thread handle sending the output buffer
  to all replicas. But before, other IO threads could handle sending output buffer of all replicas.

## Other optimizations
This solution resolve some other problem:
- When replicas disconnect with master since of out of output buffer limit, releasing the output
  buffer of replicas may freeze server if we set big `client-output-buffer-limit` for replicas, but now,
  it doesn't cause freezing.
- This implementation may mitigate reply list copy cost time(also freezes server) when one replication
  has huge reply buffer and another replica can copy buffer for full synchronization. now, we just copy
  reference info, it is very light.
- If we set replication backlog size big, it also may cost much time to copy replication backlog into
  replica's output buffer. But this commit eliminates this problem.
- Resizing replication backlog size doesn't empty current replication backlog content.

											
										
										
											2021-10-25 02:24:31 -04:00
+								        assert {[status $master rdb_last_load_keys_expired] == 2048}
-												PSYNC2: make partial sync possible after master reboot (#8015)

The main idea is how to allow a master to load replication info from RDB file when rebooting, if master can load replication info it means that replicas may have the chance to psync with master, it can save much traffic.

The key point is we need guarantee safety and consistency, so there
are two differences between master and replica:

1. master would load the replication info as secondary ID and
   offset, in case other masters have the same replid.
2. when master loading RDB, it would propagate expired keys as DEL
   command to replication backlog, then replica can receive these
   commands to delete stale keys.
   p.s. the expired keys when RDB loading is useful for users, so
   we show it as `rdb_last_load_keys_expired` and `rdb_last_load_keys_loaded` in info persistence.

Moreover, after load replication info, master should update
`no_replica_time` in case loading RDB cost too long time.
											
										
										
											2021-09-13 03:39:11 -04:00
+								        assert {[status $replica sync_full] == 1}
 								        set digest [$master debug digest]
 								        assert {$digest eq [$replica debug digest]}
 								        assert {$digest eq [$sub_replica debug digest]}
 								    }
 								}}}