redict/tests/unit/latency-monitor.tcl

proc latency_histogram {cmd} {
    return [lindex [r latency histogram $cmd] 1]
}

start_server {tags {"latency-monitor needs:latency"}} {
    # Set a threshold high enough to avoid spurious latency events.
    r config set latency-monitor-threshold 200
    r latency reset

    test {LATENCY HISTOGRAM with empty histogram} {
        r config resetstat
        assert_match {} [latency_histogram set]
        assert {[llength [r latency histogram]] == 0}
    }

    test {LATENCY HISTOGRAM all commands} {
        r config resetstat
        r set a b
        r set c d
        assert_match {calls 2 histogram_usec *} [latency_histogram set]
    }

    test {LATENCY HISTOGRAM with a subset of commands} {
        r config resetstat
        r set a b
        r set c d
        r get a
        r hset f k v
        r hgetall f
        assert_match {calls 2 histogram_usec *} [latency_histogram set]
        assert_match {calls 1 histogram_usec *} [latency_histogram hset]
        assert_match {calls 1 histogram_usec *} [latency_histogram hgetall]
        assert_match {calls 1 histogram_usec *} [latency_histogram get]
        assert {[llength [r latency histogram]] == 8}
        assert {[llength [r latency histogram set get]] == 4}
    }

    test {LATENCY HISTOGRAM command} {
        r config resetstat
        r set a b
        r get a
        assert {[llength [r latency histogram]] == 4}
        assert {[llength [r latency histogram set get]] == 4}
    }

    test {LATENCY HISTOGRAM with wrong command name skips the invalid one} {
        r config resetstat
        assert {[llength [r latency histogram blabla]] == 0}
        assert {[llength [r latency histogram blabla blabla2 set get]] == 0}
        r set a b
        r get a
        assert_match {calls 1 histogram_usec *} [lindex [r latency histogram blabla blabla2 set get] 1]
        assert_match {calls 1 histogram_usec *} [lindex [r latency histogram blabla blabla2 set get] 3]
        assert {[string length [r latency histogram blabla set get]] > 0}
    }

    test {Test latency events logging} {
        r debug sleep 0.3
        after 1100
        r debug sleep 0.4
        after 1100
        r debug sleep 0.5
        assert {[r latency history command] >= 3}
    } {} {needs:debug}

    test {LATENCY HISTORY output is ok} {
        set min 250
        set max 450
        foreach event [r latency history command] {
            lassign $event time latency
            if {!$::no_latency} {
                assert {$latency >= $min && $latency <= $max}
            }
            incr min 100
            incr max 100
            set last_time $time ; # Used in the next test
        }
    }

    test {LATENCY LATEST output is ok} {
        foreach event [r latency latest] {
            lassign $event eventname time latency max
            assert {$eventname eq "command"}
            if {!$::no_latency} {
                assert {$max >= 450 & $max <= 650}
                assert {$time == $last_time}
            }
            break
        }
    }

    test {LATENCY of expire events are correctly collected} {
        r config set latency-monitor-threshold 20
        r flushdb
        if {$::valgrind} {set count 100000} else {set count 1000000}
        r eval {
            local i = 0
            while (i < tonumber(ARGV[1])) do
                redis.call('sadd',KEYS[1],i)
                i = i+1
             end
        } 1 mybigkey $count
        r pexpire mybigkey 50
        wait_for_condition 5 100 {
            [r dbsize] == 0
        } else {
            fail "key wasn't expired"
        }
        assert_match {*expire-cycle*} [r latency latest]
    }

    test {LATENCY HISTORY / RESET with wrong event name is fine} {
        assert {[llength [r latency history blabla]] == 0}
        assert {[r latency reset blabla] == 0}
    }

    test {LATENCY DOCTOR produces some output} {
        assert {[string length [r latency doctor]] > 0}
    }

    test {LATENCY RESET is able to reset events} {
        assert {[r latency reset] > 0}
        assert {[r latency latest] eq {}}
    }

    test {LATENCY HELP should not have unexpected options} {
        catch {r LATENCY help xxx} e
        assert_match "*wrong number of arguments*" $e
    }
}
Added INFO LATENCYSTATS section: latency by percentile distribution/latency by cumulative distribution of latencies (#9462) # Short description The Redis extended latency stats track per command latencies and enables: - exporting the per-command percentile distribution via the `INFO LATENCYSTATS` command. ( percentile distribution is not mergeable between cluster nodes ). - exporting the per-command cumulative latency distributions via the `LATENCY HISTOGRAM` command. Using the cumulative distribution of latencies we can merge several stats from different cluster nodes to calculate aggregate metrics . By default, the extended latency monitoring is enabled since the overhead of keeping track of the command latency is very small. If you don't want to track extended latency metrics, you can easily disable it at runtime using the command: - `CONFIG SET latency-tracking no` By default, the exported latency percentiles are the p50, p99, and p999. You can alter them at runtime using the command: - `CONFIG SET latency-tracking-info-percentiles "0.0 50.0 100.0"` ## Some details: - The total size per histogram should sit around 40 KiB. We only allocate those 40KiB when a command was called for the first time. - With regards to the WRITE overhead As seen below, there is no measurable overhead on the achievable ops/sec or full latency spectrum on the client. Including also the measured redis-benchmark for unstable vs this branch. - We track from 1 nanosecond to 1 second ( everything above 1 second is considered +Inf ) ## `INFO LATENCYSTATS` exposition format - Format: `latency_percentiles_usec_<CMDNAME>:p0=XX,p50....` ## `LATENCY HISTOGRAM [command ...]` exposition format Return a cumulative distribution of latencies in the format of a histogram for the specified command names. The histogram is composed of a map of time buckets: - Each representing a latency range, between 1 nanosecond and roughly 1 second. - Each bucket covers twice the previous bucket's range. - Empty buckets are not printed. - Everything above 1 sec is considered +Inf. - At max there will be log2(1000000000)=30 buckets We reply a map for each command in the format: `<command name> : { `calls`: <total command calls> , `histogram` : { <bucket 1> : latency , < bucket 2> : latency, ... } }` Co-authored-by: Oran Agra <oran@redislabs.com> 2022-01-05 07:01:05 -05:00			`proc latency_histogram {cmd} {`
			`return [lindex [r latency histogram $cmd] 1]`
			`}`

Improve test suite to handle external servers better. (#9033) This commit revives the improves the ability to run the test suite against external servers, instead of launching and managing `redis-server` processes as part of the test fixture. This capability existed in the past, using the `--host` and `--port` options. However, it was quite limited and mostly useful when running a specific tests. Attempting to run larger chunks of the test suite experienced many issues: * Many tests depend on being able to start and control `redis-server` themselves, and there's no clear distinction between external server compatible and other tests. * Cluster mode is not supported (resulting with `CROSSSLOT` errors). This PR cleans up many things and makes it possible to run the entire test suite against an external server. It also provides more fine grained controls to handle cases where the external server supports a subset of the Redis commands, limited number of databases, cluster mode, etc. The tests directory now contains a `README.md` file that describes how this works. This commit also includes additional cleanups and fixes: * Tests can now be tagged. * Tag-based selection is now unified across `start_server`, `tags` and `test`. * More information is provided about skipped or ignored tests. * Repeated patterns in tests have been extracted to common procedures, both at a global level and on a per-test file basis. * Cleaned up some cases where test setup was based on a previous test executing (a major anti-pattern that repeats itself in many places). * Cleaned up some cases where test teardown was not part of a test (in the future we should have dedicated teardown code that executes even when tests fail). * Fixed some tests that were flaky running on external servers. 2021-06-09 08:13:24 -04:00			`start_server {tags {"latency-monitor needs:latency"}} {`
LATENCY command / monitor basic tests. 2014-07-10 10:23:48 -04:00			`# Set a threshold high enough to avoid spurious latency events.`
			`r config set latency-monitor-threshold 200`
			`r latency reset`

Added INFO LATENCYSTATS section: latency by percentile distribution/latency by cumulative distribution of latencies (#9462) # Short description The Redis extended latency stats track per command latencies and enables: - exporting the per-command percentile distribution via the `INFO LATENCYSTATS` command. ( percentile distribution is not mergeable between cluster nodes ). - exporting the per-command cumulative latency distributions via the `LATENCY HISTOGRAM` command. Using the cumulative distribution of latencies we can merge several stats from different cluster nodes to calculate aggregate metrics . By default, the extended latency monitoring is enabled since the overhead of keeping track of the command latency is very small. If you don't want to track extended latency metrics, you can easily disable it at runtime using the command: - `CONFIG SET latency-tracking no` By default, the exported latency percentiles are the p50, p99, and p999. You can alter them at runtime using the command: - `CONFIG SET latency-tracking-info-percentiles "0.0 50.0 100.0"` ## Some details: - The total size per histogram should sit around 40 KiB. We only allocate those 40KiB when a command was called for the first time. - With regards to the WRITE overhead As seen below, there is no measurable overhead on the achievable ops/sec or full latency spectrum on the client. Including also the measured redis-benchmark for unstable vs this branch. - We track from 1 nanosecond to 1 second ( everything above 1 second is considered +Inf ) ## `INFO LATENCYSTATS` exposition format - Format: `latency_percentiles_usec_<CMDNAME>:p0=XX,p50....` ## `LATENCY HISTOGRAM [command ...]` exposition format Return a cumulative distribution of latencies in the format of a histogram for the specified command names. The histogram is composed of a map of time buckets: - Each representing a latency range, between 1 nanosecond and roughly 1 second. - Each bucket covers twice the previous bucket's range. - Empty buckets are not printed. - Everything above 1 sec is considered +Inf. - At max there will be log2(1000000000)=30 buckets We reply a map for each command in the format: `<command name> : { `calls`: <total command calls> , `histogram` : { <bucket 1> : latency , < bucket 2> : latency, ... } }` Co-authored-by: Oran Agra <oran@redislabs.com> 2022-01-05 07:01:05 -05:00			`test {LATENCY HISTOGRAM with empty histogram} {`
			`r config resetstat`
			`assert_match {} [latency_histogram set]`
			`assert {[llength [r latency histogram]] == 0}`
			`}`

			`test {LATENCY HISTOGRAM all commands} {`
			`r config resetstat`
			`r set a b`
			`r set c d`
			`assert_match {calls 2 histogram_usec *} [latency_histogram set]`
			`}`

			`test {LATENCY HISTOGRAM with a subset of commands} {`
			`r config resetstat`
			`r set a b`
			`r set c d`
			`r get a`
			`r hset f k v`
			`r hgetall f`
			`assert_match {calls 2 histogram_usec *} [latency_histogram set]`
			`assert_match {calls 1 histogram_usec *} [latency_histogram hset]`
			`assert_match {calls 1 histogram_usec *} [latency_histogram hgetall]`
			`assert_match {calls 1 histogram_usec *} [latency_histogram get]`
			`assert {[llength [r latency histogram]] == 8}`
			`assert {[llength [r latency histogram set get]] == 4}`
			`}`

			`test {LATENCY HISTOGRAM command} {`
			`r config resetstat`
			`r set a b`
			`r get a`
			`assert {[llength [r latency histogram]] == 4}`
			`assert {[llength [r latency histogram set get]] == 4}`
			`}`

			`test {LATENCY HISTOGRAM with wrong command name skips the invalid one} {`
			`r config resetstat`
			`assert {[llength [r latency histogram blabla]] == 0}`
			`assert {[llength [r latency histogram blabla blabla2 set get]] == 0}`
			`r set a b`
			`r get a`
			`assert_match {calls 1 histogram_usec *} [lindex [r latency histogram blabla blabla2 set get] 1]`
			`assert_match {calls 1 histogram_usec *} [lindex [r latency histogram blabla blabla2 set get] 3]`
			`assert {[string length [r latency histogram blabla set get]] > 0}`
			`}`

LATENCY command / monitor basic tests. 2014-07-10 10:23:48 -04:00			`test {Test latency events logging} {`
			`r debug sleep 0.3`
			`after 1100`
			`r debug sleep 0.4`
			`after 1100`
			`r debug sleep 0.5`
			`assert {[r latency history command] >= 3}`
Add external test that runs without debug command (#9964) - add needs:debug flag for some tests - disable "save" in external tests (speedup?) - use debug_digest proc instead of debug command directly so it can be skipped - use OBJECT ENCODING instead of DEBUG OBJECT to get encoding - add a proc for OBJECT REFCOUNT so it can be skipped - move a bunch of tests in latency_monitor tests to happen later so that latency monitor has some values in it - add missing close_replication_stream calls - make sure to close the temp client if DEBUG LOG fails 2021-12-19 10:41:51 -05:00			`} {} {needs:debug}`
LATENCY command / monitor basic tests. 2014-07-10 10:23:48 -04:00
			`test {LATENCY HISTORY output is ok} {`
			`set min 250`
			`set max 450`
			`foreach event [r latency history command] {`
			`lassign $event time latency`
Add a --no-latency tests flag. (#7939) Useful for running tests on systems which may be way slower than usual. 2020-10-22 04:10:53 -04:00			`if {!$::no_latency} {`
			`assert {$latency >= $min && $latency <= $max}`
			`}`
LATENCY command / monitor basic tests. 2014-07-10 10:23:48 -04:00			`incr min 100`
			`incr max 100`
			`set last_time $time ; # Used in the next test`
			`}`
			`}`

			`test {LATENCY LATEST output is ok} {`
			`foreach event [r latency latest] {`
			`lassign $event eventname time latency max`
			`assert {$eventname eq "command"}`
Add a --no-latency tests flag. (#7939) Useful for running tests on systems which may be way slower than usual. 2020-10-22 04:10:53 -04:00			`if {!$::no_latency} {`
			`assert {$max >= 450 & $max <= 650}`
			`assert {$time == $last_time}`
			`}`
LATENCY command / monitor basic tests. 2014-07-10 10:23:48 -04:00			`break`
			`}`
			`}`

Test: regression test for latency expire events logging bug. Regression for #4452. 2017-11-24 12:33:31 -05:00			`test {LATENCY of expire events are correctly collected} {`
			`r config set latency-monitor-threshold 20`
fix recently added time sensitive tests failing with valgrind (#7512) interestingly the latency monitor test fails because valgrind is slow enough so that the time inside PEXPIREAT command from the moment of the first mstime() call to get the basetime until checkAlreadyExpired calls mstime() again is more than 1ms, and that test was too sensitive. using this opportunity to speed up the test (unrelated to the failure) the fix is just the longer time passed to PEXPIRE. 2020-07-13 09:40:03 -04:00			`r flushdb`
			`if {$::valgrind} {set count 100000} else {set count 1000000}`
Test: regression test for latency expire events logging bug. Regression for #4452. 2017-11-24 12:33:31 -05:00			`r eval {`
			`local i = 0`
fix recently added time sensitive tests failing with valgrind (#7512) interestingly the latency monitor test fails because valgrind is slow enough so that the time inside PEXPIREAT command from the moment of the first mstime() call to get the basetime until checkAlreadyExpired calls mstime() again is more than 1ms, and that test was too sensitive. using this opportunity to speed up the test (unrelated to the failure) the fix is just the longer time passed to PEXPIRE. 2020-07-13 09:40:03 -04:00			`while (i < tonumber(ARGV[1])) do`
			`redis.call('sadd',KEYS[1],i)`
Test: regression test for latency expire events logging bug. Regression for #4452. 2017-11-24 12:33:31 -05:00			`i = i+1`
			`end`
fix recently added time sensitive tests failing with valgrind (#7512) interestingly the latency monitor test fails because valgrind is slow enough so that the time inside PEXPIREAT command from the moment of the first mstime() call to get the basetime until checkAlreadyExpired calls mstime() again is more than 1ms, and that test was too sensitive. using this opportunity to speed up the test (unrelated to the failure) the fix is just the longer time passed to PEXPIRE. 2020-07-13 09:40:03 -04:00			`} 1 mybigkey $count`
			`r pexpire mybigkey 50`
			`wait_for_condition 5 100 {`
			`[r dbsize] == 0`
			`} else {`
			`fail "key wasn't expired"`
			`}`
Test: regression test for latency expire events logging bug. Regression for #4452. 2017-11-24 12:33:31 -05:00			`assert_match {expire-cycle} [r latency latest]`
			`}`
Fix command help for unexpected options (#7476) 2020-07-15 05:38:22 -04:00
Add external test that runs without debug command (#9964) - add needs:debug flag for some tests - disable "save" in external tests (speedup?) - use debug_digest proc instead of debug command directly so it can be skipped - use OBJECT ENCODING instead of DEBUG OBJECT to get encoding - add a proc for OBJECT REFCOUNT so it can be skipped - move a bunch of tests in latency_monitor tests to happen later so that latency monitor has some values in it - add missing close_replication_stream calls - make sure to close the temp client if DEBUG LOG fails 2021-12-19 10:41:51 -05:00			`test {LATENCY HISTORY / RESET with wrong event name is fine} {`
			`assert {[llength [r latency history blabla]] == 0}`
			`assert {[r latency reset blabla] == 0}`
			`}`

			`test {LATENCY DOCTOR produces some output} {`
			`assert {[string length [r latency doctor]] > 0}`
			`}`

			`test {LATENCY RESET is able to reset events} {`
			`assert {[r latency reset] > 0}`
			`assert {[r latency latest] eq {}}`
			`}`

Fix command help for unexpected options (#7476) 2020-07-15 05:38:22 -04:00			`test {LATENCY HELP should not have unexpected options} {`
			`catch {r LATENCY help xxx} e`
Treat subcommands as commands (#9504) ## Intro The purpose is to allow having different flags/ACL categories for subcommands (Example: CONFIG GET is ok-loading but CONFIG SET isn't) We create a small command table for every command that has subcommands and each subcommand has its own flags, etc. (same as a "regular" command) This commit also unites the Redis and the Sentinel command tables ## Affected commands CONFIG Used to have "admin ok-loading ok-stale no-script" Changes: 1. Dropped "ok-loading" in all except GET (this doesn't change behavior since there were checks in the code doing that) XINFO Used to have "read-only random" Changes: 1. Dropped "random" in all except CONSUMERS XGROUP Used to have "write use-memory" Changes: 1. Dropped "use-memory" in all except CREATE and CREATECONSUMER COMMAND No changes. MEMORY Used to have "random read-only" Changes: 1. Dropped "random" in PURGE and USAGE ACL Used to have "admin no-script ok-loading ok-stale" Changes: 1. Dropped "admin" in WHOAMI, GENPASS, and CAT LATENCY No changes. MODULE No changes. SLOWLOG Used to have "admin random ok-loading ok-stale" Changes: 1. Dropped "random" in RESET OBJECT Used to have "read-only random" Changes: 1. Dropped "random" in ENCODING and REFCOUNT SCRIPT Used to have "may-replicate no-script" Changes: 1. Dropped "may-replicate" in all except FLUSH and LOAD CLIENT Used to have "admin no-script random ok-loading ok-stale" Changes: 1. Dropped "random" in all except INFO and LIST 2. Dropped "admin" in ID, TRACKING, CACHING, GETREDIR, INFO, SETNAME, GETNAME, and REPLY STRALGO No changes. PUBSUB No changes. CLUSTER Changes: 1. Dropped "admin in countkeysinslots, getkeysinslot, info, nodes, keyslot, myid, and slots SENTINEL No changes. (note that DEBUG also fits, but we decided not to convert it since it's for debugging and anyway undocumented) ## New sub-command This commit adds another element to the per-command output of COMMAND, describing the list of subcommands, if any (in the same structure as "regular" commands) Also, it adds a new subcommand: ``` COMMAND LIST [FILTERBY (MODULE <module-name>\|ACLCAT <cat>\|PATTERN <pattern>)] ``` which returns a set of all commands (unless filters), but excluding subcommands. ## Module API A new module API, RM_CreateSubcommand, was added, in order to allow module writer to define subcommands ## ACL changes: 1. Now, that each subcommand is actually a command, each has its own ACL id. 2. The old mechanism of allowed_subcommands is redundant (blocking/allowing a subcommand is the same as blocking/allowing a regular command), but we had to keep it, to support the widespread usage of allowed_subcommands to block commands with certain args, that aren't subcommands (e.g. "-select +select\|0"). 3. I have renamed allowed_subcommands to allowed_firstargs to emphasize the difference. 4. Because subcommands are commands in ACL too, you can now use "-" to block subcommands (e.g. "+client -client\|kill"), which wasn't possible in the past. 5. It is also possible to use the allowed_firstargs mechanism with subcommand. For example: `+config -config\|set +config\|set\|loglevel` will block all CONFIG SET except for setting the log level. 6. All of the ACL changes above required some amount of refactoring. ## Misc 1. There are two approaches: Either each subcommand has its own function or all subcommands use the same function, determining what to do according to argv[0]. For now, I took the former approaches only with CONFIG and COMMAND, while other commands use the latter approach (for smaller blamelog diff). 2. Deleted memoryGetKeys: It is no longer needed because MEMORY USAGE now uses the "range" key spec. 4. Bugfix: GETNAME was missing from CLIENT's help message. 5. Sentinel and Redis now use the same table, with the same function pointer. Some commands have a different implementation in Sentinel, so we redirect them (these are ROLE, PUBLISH, and INFO). 6. Command stats now show the stats per subcommand (e.g. instead of stats just for "config" you will have stats for "config\|set", "config\|get", etc.) 7. It is now possible to use COMMAND directly on subcommands: COMMAND INFO CONFIG\|GET (The pipeline syntax was inspired from ACL, and can be used in functions lookupCommandBySds and lookupCommandByCString) 8. STRALGO is now a container command (has "help") ## Breaking changes: 1. Command stats now show the stats per subcommand (see (5) above) 2021-10-20 04:52:57 -04:00			`assert_match "wrong number of arguments" $e`
Fix command help for unexpected options (#7476) 2020-07-15 05:38:22 -04:00			`}`
LATENCY command / monitor basic tests. 2014-07-10 10:23:48 -04:00			`}`