Improve valgrind support for cluster tests (#7725)

- redirect valgrind reports to a dedicated file rather than console - try to avoid killing instances with SIGKILL so that we get the memory leak report (killing with SIGTERM before resorting to SIGKILL) - search for valgrind reports when done, print them and fail the tests - add --dont-clean option to keep the logs on exit - fix exit error code when crash is found (would have exited with 0) changes that affect the normal redis test suite: - refactor check_valgrind_errors into two functions one to search and one to report - move the search half into util.tcl to serve the cluster tests too - ignore "address range perms" valgrind warnings which seem non relevant.
2025-01-22 16:18:28 -05:00 · 2020-09-06 11:11:49 +03:00 · 2020-09-06 11:11:49 +03:00 · 2b998de460
commit 2b998de460
parent fe5da2e60d
3 changed files with 83 additions and 23 deletions
--- a/tests/instances.tcl
+++ b/tests/instances.tcl
@ -19,6 +19,7 @@ set ::verbose 0
 set ::valgrind 0
 set ::tls 0
 set ::pause_on_error 0
+set ::dont_clean 0
 set ::simulate_error 0
 set ::failed 0
 set ::sentinel_instances {}
@ -38,7 +39,7 @@ if {[catch {cd tmp}]} {

 # Execute the specified instance of the server specified by 'type', using
 # the provided configuration file. Returns the PID of the process.
-proc exec_instance {type cfgfile} {
+proc exec_instance {type dirname cfgfile} {
    if {$type eq "redis"} {
        set prgname redis-server
    } elseif {$type eq "sentinel"} {
@ -47,8 +48,9 @@ proc exec_instance {type cfgfile} {
        error "Unknown instance type."
    }

+    set errfile [file join $dirname err.txt]
    if {$::valgrind} {
-        set pid [exec valgrind --track-origins=yes --suppressions=../../../src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full ../../../src/${prgname} $cfgfile &]
+        set pid [exec valgrind --track-origins=yes --suppressions=../../../src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full ../../../src/${prgname} $cfgfile 2>> $errfile &]
    } else {
        set pid [exec ../../../src/${prgname} $cfgfile &]
    }
@ -93,7 +95,7 @@ proc spawn_instance {type base_port count {conf {}}} {
        # Finally exec it and remember the pid for later cleanup.
        set retry 100
        while {$retry} {
-            set pid [exec_instance $type $cfgfile]
+            set pid [exec_instance $type $dirname $cfgfile]

            # Check availability
            if {[server_is_up 127.0.0.1 $port 100] == 0} {
@ -144,16 +146,60 @@ proc log_crashes {} {
                puts "\n*** Crash report found in $log ***"
                set found 1
            }
-            if {$found} {puts $line}
+            if {$found} {
+                puts $line
+                incr ::failed
+            }
        }
    }
+
+    set logs [glob */err.txt]
+    foreach log $logs {
+        set res [find_valgrind_errors $log]
+        if {$res != ""} {
+            puts $res
+            incr ::failed
+        }
+    }
+}
+
+proc is_alive pid {
+    if {[catch {exec ps -p $pid} err]} {
+        return 0
+    } else {
+        return 1
+    }
+}
+
+proc stop_instance pid {
+    catch {exec kill $pid}
+    if {$::valgrind} {
+        set max_wait 60000
+    } else {
+        set max_wait 10000
+    }
+    while {[is_alive $pid]} {
+        incr wait 10
+
+        if {$wait >= $max_wait} {
+            puts "Forcing process $pid to exit..."
+            catch {exec kill -KILL $pid}
+        } elseif {$wait % 1000 == 0} {
+            puts "Waiting for process $pid to exit..."
+        }
+        after 10
+    }
 }

 proc cleanup {} {
    puts "Cleaning up..."
-    log_crashes
    foreach pid $::pids {
-        catch {exec kill -9 $pid}
+        puts "killing stale instance $pid"
+        stop_instance $pid
+    }
+    log_crashes
+    if {$::dont_clean} {
+        return
    }
    foreach dir $::dirs {
        catch {exec rm -rf $dir}
@ -178,6 +224,8 @@ proc parse_options {} {
            set ::run_matching "*${val}*"
        } elseif {$opt eq "--pause-on-error"} {
            set ::pause_on_error 1
+        } elseif {$opt eq {--dont-clean}} {
+            set ::dont_clean 1
        } elseif {$opt eq "--fail"} {
            set ::simulate_error 1
        } elseif {$opt eq {--valgrind}} {
@ -191,6 +239,7 @@ proc parse_options {} {
            set ::tls 1
        } elseif {$opt eq "--help"} {
            puts "--single <pattern>      Only runs tests specified by pattern."
+            puts "--dont-clean            Keep log files on exit."
            puts "--pause-on-error        Pause for manual inspection on error."
            puts "--fail                  Simulate a test failure."
            puts "--valgrind              Run with valgrind."
@ -486,7 +535,7 @@ proc kill_instance {type id} {
        error "You tried to kill $type $id twice."
    }

-    exec kill -9 $pid
+    stop_instance $pid
    set_instance_attrib $type $id pid -1
    set_instance_attrib $type $id link you_tried_to_talk_with_killed_instance

@ -521,7 +570,7 @@ proc restart_instance {type id} {

    # Execute the instance with its old setup and append the new pid
    # file for cleanup.
-    set pid [exec_instance $type $cfgfile]
+    set pid [exec_instance $type $dirname $cfgfile]
    set_instance_attrib $type $id pid $pid
    lappend ::pids $pid

--- a/tests/support/server.tcl
+++ b/tests/support/server.tcl
@ -13,21 +13,9 @@ proc start_server_error {config_file error} {
 }

 proc check_valgrind_errors stderr {
-    set fd [open $stderr]
-    set buf [read $fd]
-    close $fd
-
-    # look for stack trace and other errors, or the absense of a leak free summary
-    if {[regexp -- { at 0x} $buf] ||
-        [regexp -- {Warning} $buf] ||
-        [regexp -- {Invalid} $buf] ||
-        [regexp -- {Mismatched} $buf] ||
-        [regexp -- {uninitialized} $buf] ||
-        [regexp -- {has a fishy} $buf] ||
-        [regexp -- {overlap} $buf] ||
-        (![regexp -- {definitely lost: 0 bytes} $buf] &&
-         ![regexp -- {no leaks are possible} $buf])} {
-        send_data_packet $::test_server_fd err "Valgrind error: $buf\n"
+    set res [find_valgrind_errors $stderr]
+    if {$res != ""} {
+        send_data_packet $::test_server_fd err "Valgrind error: $res\n"
    }
 }

--- a/tests/support/util.tcl
+++ b/tests/support/util.tcl
@ -432,6 +432,29 @@ proc colorstr {color str} {
    }
 }

+proc find_valgrind_errors {stderr} {
+    set fd [open $stderr]
+    set buf [read $fd]
+    close $fd
+
+    # Look for stack trace (" at 0x") and other errors (Invalid, Mismatched, etc).
+    # Look for "Warnings", but not the "set address range perms". These don't indicate any real concern.
+    # Look for the absense of a leak free summary (happens when redis isn't terminated properly).
+    if {[regexp -- { at 0x} $buf] ||
+        [regexp -- {^(?=.*Warning)(?:(?!set address range perms).)*$} $buf] ||
+        [regexp -- {Invalid} $buf] ||
+        [regexp -- {Mismatched} $buf] ||
+        [regexp -- {uninitialized} $buf] ||
+        [regexp -- {has a fishy} $buf] ||
+        [regexp -- {overlap} $buf] ||
+        (![regexp -- {definitely lost: 0 bytes} $buf] &&
+         ![regexp -- {no leaks are possible} $buf])} {
+        return $buf
+    }
+
+    return ""
+}
+
 # Execute a background process writing random data for the specified number
 # of seconds to the specified Redis instance.
 proc start_write_load {host port seconds} {