mirror of
https://codeberg.org/redict/redict.git
synced 2025-01-23 08:38:27 -05:00
Merge pull request #7289 from oranagra/defrag_edge_case
fix a rare active defrag edge case bug leading to stagnation
This commit is contained in:
commit
af34245692
@ -216,7 +216,7 @@ ixalloc(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra,
|
|||||||
}
|
}
|
||||||
|
|
||||||
JEMALLOC_ALWAYS_INLINE int
|
JEMALLOC_ALWAYS_INLINE int
|
||||||
iget_defrag_hint(tsdn_t *tsdn, void* ptr, int *bin_util, int *run_util) {
|
iget_defrag_hint(tsdn_t *tsdn, void* ptr) {
|
||||||
int defrag = 0;
|
int defrag = 0;
|
||||||
rtree_ctx_t rtree_ctx_fallback;
|
rtree_ctx_t rtree_ctx_fallback;
|
||||||
rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
|
rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
|
||||||
@ -232,11 +232,22 @@ iget_defrag_hint(tsdn_t *tsdn, void* ptr, int *bin_util, int *run_util) {
|
|||||||
malloc_mutex_lock(tsdn, &bin->lock);
|
malloc_mutex_lock(tsdn, &bin->lock);
|
||||||
/* don't bother moving allocations from the slab currently used for new allocations */
|
/* don't bother moving allocations from the slab currently used for new allocations */
|
||||||
if (slab != bin->slabcur) {
|
if (slab != bin->slabcur) {
|
||||||
const bin_info_t *bin_info = &bin_infos[binind];
|
int free_in_slab = extent_nfree_get(slab);
|
||||||
size_t availregs = bin_info->nregs * bin->stats.curslabs;
|
if (free_in_slab) {
|
||||||
*bin_util = ((long long)bin->stats.curregs<<16) / availregs;
|
const bin_info_t *bin_info = &bin_infos[binind];
|
||||||
*run_util = ((long long)(bin_info->nregs - extent_nfree_get(slab))<<16) / bin_info->nregs;
|
int curslabs = bin->stats.curslabs;
|
||||||
defrag = 1;
|
size_t curregs = bin->stats.curregs;
|
||||||
|
if (bin->slabcur) {
|
||||||
|
/* remove slabcur from the overall utilization */
|
||||||
|
curregs -= bin_info->nregs - extent_nfree_get(bin->slabcur);
|
||||||
|
curslabs -= 1;
|
||||||
|
}
|
||||||
|
/* Compare the utilization ratio of the slab in question to the total average,
|
||||||
|
* to avoid precision lost and division, we do that by extrapolating the usage
|
||||||
|
* of the slab as if all slabs have the same usage. If this slab is less used
|
||||||
|
* than the average, we'll prefer to evict the data to hopefully more used ones */
|
||||||
|
defrag = (bin_info->nregs - free_in_slab) * curslabs <= curregs;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
malloc_mutex_unlock(tsdn, &bin->lock);
|
malloc_mutex_unlock(tsdn, &bin->lock);
|
||||||
}
|
}
|
||||||
|
8
deps/jemalloc/src/jemalloc.c
vendored
8
deps/jemalloc/src/jemalloc.c
vendored
@ -3326,12 +3326,10 @@ jemalloc_postfork_child(void) {
|
|||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
|
|
||||||
/* Helps the application decide if a pointer is worth re-allocating in order to reduce fragmentation.
|
/* Helps the application decide if a pointer is worth re-allocating in order to reduce fragmentation.
|
||||||
* returns 0 if the allocation is in the currently active run,
|
* returns 1 if the allocation should be moved, and 0 if the allocation be kept.
|
||||||
* or when it is not causing any frag issue (large or huge bin)
|
|
||||||
* returns the bin utilization and run utilization both in fixed point 16:16.
|
|
||||||
* If the application decides to re-allocate it should use MALLOCX_TCACHE_NONE when doing so. */
|
* If the application decides to re-allocate it should use MALLOCX_TCACHE_NONE when doing so. */
|
||||||
JEMALLOC_EXPORT int JEMALLOC_NOTHROW
|
JEMALLOC_EXPORT int JEMALLOC_NOTHROW
|
||||||
get_defrag_hint(void* ptr, int *bin_util, int *run_util) {
|
get_defrag_hint(void* ptr) {
|
||||||
assert(ptr != NULL);
|
assert(ptr != NULL);
|
||||||
return iget_defrag_hint(TSDN_NULL, ptr, bin_util, run_util);
|
return iget_defrag_hint(TSDN_NULL, ptr);
|
||||||
}
|
}
|
||||||
|
34
src/debug.c
34
src/debug.c
@ -311,6 +311,13 @@ void mallctl_int(client *c, robj **argv, int argc) {
|
|||||||
size_t sz = sizeof(old);
|
size_t sz = sizeof(old);
|
||||||
while (sz > 0) {
|
while (sz > 0) {
|
||||||
if ((ret=je_mallctl(argv[0]->ptr, &old, &sz, argc > 1? &val: NULL, argc > 1?sz: 0))) {
|
if ((ret=je_mallctl(argv[0]->ptr, &old, &sz, argc > 1? &val: NULL, argc > 1?sz: 0))) {
|
||||||
|
if (ret == EPERM && argc > 1) {
|
||||||
|
/* if this option is write only, try just writing to it. */
|
||||||
|
if (!(ret=je_mallctl(argv[0]->ptr, NULL, 0, &val, sz))) {
|
||||||
|
addReply(c, shared.ok);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
if (ret==EINVAL) {
|
if (ret==EINVAL) {
|
||||||
/* size might be wrong, try a smaller one */
|
/* size might be wrong, try a smaller one */
|
||||||
sz /= 2;
|
sz /= 2;
|
||||||
@ -333,17 +340,30 @@ void mallctl_int(client *c, robj **argv, int argc) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void mallctl_string(client *c, robj **argv, int argc) {
|
void mallctl_string(client *c, robj **argv, int argc) {
|
||||||
int ret;
|
int rret, wret;
|
||||||
char *old;
|
char *old;
|
||||||
size_t sz = sizeof(old);
|
size_t sz = sizeof(old);
|
||||||
/* for strings, it seems we need to first get the old value, before overriding it. */
|
/* for strings, it seems we need to first get the old value, before overriding it. */
|
||||||
if ((ret=je_mallctl(argv[0]->ptr, &old, &sz, NULL, 0))) {
|
if ((rret=je_mallctl(argv[0]->ptr, &old, &sz, NULL, 0))) {
|
||||||
addReplyErrorFormat(c,"%s", strerror(ret));
|
/* return error unless this option is write only. */
|
||||||
return;
|
if (!(rret == EPERM && argc > 1)) {
|
||||||
|
addReplyErrorFormat(c,"%s", strerror(rret));
|
||||||
|
return;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
addReplyBulkCString(c, old);
|
if(argc > 1) {
|
||||||
if(argc > 1)
|
char *val = argv[1]->ptr;
|
||||||
je_mallctl(argv[0]->ptr, NULL, 0, &argv[1]->ptr, sizeof(char*));
|
char **valref = &val;
|
||||||
|
if ((!strcmp(val,"VOID")))
|
||||||
|
valref = NULL, sz = 0;
|
||||||
|
wret = je_mallctl(argv[0]->ptr, NULL, 0, valref, sz);
|
||||||
|
}
|
||||||
|
if (!rret)
|
||||||
|
addReplyBulkCString(c, old);
|
||||||
|
else if (wret)
|
||||||
|
addReplyErrorFormat(c,"%s", strerror(wret));
|
||||||
|
else
|
||||||
|
addReply(c, shared.ok);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
13
src/defrag.c
13
src/defrag.c
@ -43,7 +43,7 @@
|
|||||||
|
|
||||||
/* this method was added to jemalloc in order to help us understand which
|
/* this method was added to jemalloc in order to help us understand which
|
||||||
* pointers are worthwhile moving and which aren't */
|
* pointers are worthwhile moving and which aren't */
|
||||||
int je_get_defrag_hint(void* ptr, int *bin_util, int *run_util);
|
int je_get_defrag_hint(void* ptr);
|
||||||
|
|
||||||
/* forward declarations*/
|
/* forward declarations*/
|
||||||
void defragDictBucketCallback(void *privdata, dictEntry **bucketref);
|
void defragDictBucketCallback(void *privdata, dictEntry **bucketref);
|
||||||
@ -55,18 +55,11 @@ dictEntry* replaceSateliteDictKeyPtrAndOrDefragDictEntry(dict *d, sds oldkey, sd
|
|||||||
* when it returns a non-null value, the old pointer was already released
|
* when it returns a non-null value, the old pointer was already released
|
||||||
* and should NOT be accessed. */
|
* and should NOT be accessed. */
|
||||||
void* activeDefragAlloc(void *ptr) {
|
void* activeDefragAlloc(void *ptr) {
|
||||||
int bin_util, run_util;
|
|
||||||
size_t size;
|
size_t size;
|
||||||
void *newptr;
|
void *newptr;
|
||||||
if(!je_get_defrag_hint(ptr, &bin_util, &run_util)) {
|
if(!je_get_defrag_hint(ptr)) {
|
||||||
server.stat_active_defrag_misses++;
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
/* if this run is more utilized than the average utilization in this bin
|
|
||||||
* (or it is full), skip it. This will eventually move all the allocations
|
|
||||||
* from relatively empty runs into relatively full runs. */
|
|
||||||
if (run_util > bin_util || run_util == 1<<16) {
|
|
||||||
server.stat_active_defrag_misses++;
|
server.stat_active_defrag_misses++;
|
||||||
|
size = zmalloc_size(ptr);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
/* move this allocation to a new allocation.
|
/* move this allocation to a new allocation.
|
||||||
|
@ -95,6 +95,10 @@ start_server {tags {"defrag"}} {
|
|||||||
}
|
}
|
||||||
if {$::verbose} {
|
if {$::verbose} {
|
||||||
puts "frag $frag"
|
puts "frag $frag"
|
||||||
|
set misses [s active_defrag_misses]
|
||||||
|
set hits [s active_defrag_hits]
|
||||||
|
puts "hits: $hits"
|
||||||
|
puts "misses: $misses"
|
||||||
puts "max latency $max_latency"
|
puts "max latency $max_latency"
|
||||||
puts [r latency latest]
|
puts [r latency latest]
|
||||||
puts [r latency history active-defrag-cycle]
|
puts [r latency history active-defrag-cycle]
|
||||||
@ -221,6 +225,10 @@ start_server {tags {"defrag"}} {
|
|||||||
}
|
}
|
||||||
if {$::verbose} {
|
if {$::verbose} {
|
||||||
puts "frag $frag"
|
puts "frag $frag"
|
||||||
|
set misses [s active_defrag_misses]
|
||||||
|
set hits [s active_defrag_hits]
|
||||||
|
puts "hits: $hits"
|
||||||
|
puts "misses: $misses"
|
||||||
puts "max latency $max_latency"
|
puts "max latency $max_latency"
|
||||||
puts [r latency latest]
|
puts [r latency latest]
|
||||||
puts [r latency history active-defrag-cycle]
|
puts [r latency history active-defrag-cycle]
|
||||||
@ -256,11 +264,12 @@ start_server {tags {"defrag"}} {
|
|||||||
set expected_frag 1.7
|
set expected_frag 1.7
|
||||||
# add a mass of list nodes to two lists (allocations are interlaced)
|
# add a mass of list nodes to two lists (allocations are interlaced)
|
||||||
set val [string repeat A 100] ;# 5 items of 100 bytes puts us in the 640 bytes bin, which has 32 regs, so high potential for fragmentation
|
set val [string repeat A 100] ;# 5 items of 100 bytes puts us in the 640 bytes bin, which has 32 regs, so high potential for fragmentation
|
||||||
for {set j 0} {$j < 500000} {incr j} {
|
set elements 500000
|
||||||
|
for {set j 0} {$j < $elements} {incr j} {
|
||||||
$rd lpush biglist1 $val
|
$rd lpush biglist1 $val
|
||||||
$rd lpush biglist2 $val
|
$rd lpush biglist2 $val
|
||||||
}
|
}
|
||||||
for {set j 0} {$j < 500000} {incr j} {
|
for {set j 0} {$j < $elements} {incr j} {
|
||||||
$rd read ; # Discard replies
|
$rd read ; # Discard replies
|
||||||
$rd read ; # Discard replies
|
$rd read ; # Discard replies
|
||||||
}
|
}
|
||||||
@ -302,6 +311,8 @@ start_server {tags {"defrag"}} {
|
|||||||
|
|
||||||
# test the the fragmentation is lower
|
# test the the fragmentation is lower
|
||||||
after 120 ;# serverCron only updates the info once in 100ms
|
after 120 ;# serverCron only updates the info once in 100ms
|
||||||
|
set misses [s active_defrag_misses]
|
||||||
|
set hits [s active_defrag_hits]
|
||||||
set frag [s allocator_frag_ratio]
|
set frag [s allocator_frag_ratio]
|
||||||
set max_latency 0
|
set max_latency 0
|
||||||
foreach event [r latency latest] {
|
foreach event [r latency latest] {
|
||||||
@ -312,6 +323,8 @@ start_server {tags {"defrag"}} {
|
|||||||
}
|
}
|
||||||
if {$::verbose} {
|
if {$::verbose} {
|
||||||
puts "frag $frag"
|
puts "frag $frag"
|
||||||
|
puts "misses: $misses"
|
||||||
|
puts "hits: $hits"
|
||||||
puts "max latency $max_latency"
|
puts "max latency $max_latency"
|
||||||
puts [r latency latest]
|
puts [r latency latest]
|
||||||
puts [r latency history active-defrag-cycle]
|
puts [r latency history active-defrag-cycle]
|
||||||
@ -320,6 +333,10 @@ start_server {tags {"defrag"}} {
|
|||||||
# due to high fragmentation, 100hz, and active-defrag-cycle-max set to 75,
|
# due to high fragmentation, 100hz, and active-defrag-cycle-max set to 75,
|
||||||
# we expect max latency to be not much higher than 7.5ms but due to rare slowness threshold is set higher
|
# we expect max latency to be not much higher than 7.5ms but due to rare slowness threshold is set higher
|
||||||
assert {$max_latency <= 30}
|
assert {$max_latency <= 30}
|
||||||
|
|
||||||
|
# in extreme cases of stagnation, we see over 20m misses before the tests aborts with "defrag didn't stop",
|
||||||
|
# in normal cases we only see 100k misses out of 500k elements
|
||||||
|
assert {$misses < $elements}
|
||||||
}
|
}
|
||||||
# verify the data isn't corrupted or changed
|
# verify the data isn't corrupted or changed
|
||||||
set newdigest [r debug digest]
|
set newdigest [r debug digest]
|
||||||
@ -327,6 +344,110 @@ start_server {tags {"defrag"}} {
|
|||||||
r save ;# saving an rdb iterates over all the data / pointers
|
r save ;# saving an rdb iterates over all the data / pointers
|
||||||
r del biglist1 ;# coverage for quicklistBookmarksClear
|
r del biglist1 ;# coverage for quicklistBookmarksClear
|
||||||
} {1}
|
} {1}
|
||||||
|
|
||||||
|
test "Active defrag edge case" {
|
||||||
|
# there was an edge case in defrag where all the slabs of a certain bin are exact the same
|
||||||
|
# % utilization, with the exception of the current slab from which new allocations are made
|
||||||
|
# if the current slab is lower in utilization the defragger would have ended up in stagnation,
|
||||||
|
# keept running and not move any allocation.
|
||||||
|
# this test is more consistent on a fresh server with no history
|
||||||
|
start_server {tags {"defrag"}} {
|
||||||
|
r flushdb
|
||||||
|
r config resetstat
|
||||||
|
r config set save "" ;# prevent bgsave from interfereing with save below
|
||||||
|
r config set hz 100
|
||||||
|
r config set activedefrag no
|
||||||
|
r config set active-defrag-max-scan-fields 1000
|
||||||
|
r config set active-defrag-threshold-lower 5
|
||||||
|
r config set active-defrag-cycle-min 65
|
||||||
|
r config set active-defrag-cycle-max 75
|
||||||
|
r config set active-defrag-ignore-bytes 1mb
|
||||||
|
r config set maxmemory 0
|
||||||
|
set expected_frag 1.3
|
||||||
|
|
||||||
|
r debug mallctl-str thread.tcache.flush VOID
|
||||||
|
# fill the first slab containin 32 regs of 640 bytes.
|
||||||
|
for {set j 0} {$j < 32} {incr j} {
|
||||||
|
r setrange "_$j" 600 x
|
||||||
|
r debug mallctl-str thread.tcache.flush VOID
|
||||||
|
}
|
||||||
|
|
||||||
|
# add a mass of keys with 600 bytes values, fill the bin of 640 bytes which has 32 regs per slab.
|
||||||
|
set rd [redis_deferring_client]
|
||||||
|
set keys 640000
|
||||||
|
for {set j 0} {$j < $keys} {incr j} {
|
||||||
|
$rd setrange $j 600 x
|
||||||
|
}
|
||||||
|
for {set j 0} {$j < $keys} {incr j} {
|
||||||
|
$rd read ; # Discard replies
|
||||||
|
}
|
||||||
|
|
||||||
|
# create some fragmentation of 50%
|
||||||
|
set sent 0
|
||||||
|
for {set j 0} {$j < $keys} {incr j 1} {
|
||||||
|
$rd del $j
|
||||||
|
incr sent
|
||||||
|
incr j 1
|
||||||
|
}
|
||||||
|
for {set j 0} {$j < $sent} {incr j} {
|
||||||
|
$rd read ; # Discard replies
|
||||||
|
}
|
||||||
|
|
||||||
|
# create higher fragmentation in the first slab
|
||||||
|
for {set j 10} {$j < 32} {incr j} {
|
||||||
|
r del "_$j"
|
||||||
|
}
|
||||||
|
|
||||||
|
# start defrag
|
||||||
|
after 120 ;# serverCron only updates the info once in 100ms
|
||||||
|
set frag [s allocator_frag_ratio]
|
||||||
|
if {$::verbose} {
|
||||||
|
puts "frag $frag"
|
||||||
|
}
|
||||||
|
|
||||||
|
assert {$frag >= $expected_frag}
|
||||||
|
|
||||||
|
set digest [r debug digest]
|
||||||
|
catch {r config set activedefrag yes} e
|
||||||
|
if {![string match {DISABLED*} $e]} {
|
||||||
|
# wait for the active defrag to start working (decision once a second)
|
||||||
|
wait_for_condition 50 100 {
|
||||||
|
[s active_defrag_running] ne 0
|
||||||
|
} else {
|
||||||
|
fail "defrag not started."
|
||||||
|
}
|
||||||
|
|
||||||
|
# wait for the active defrag to stop working
|
||||||
|
wait_for_condition 500 100 {
|
||||||
|
[s active_defrag_running] eq 0
|
||||||
|
} else {
|
||||||
|
after 120 ;# serverCron only updates the info once in 100ms
|
||||||
|
puts [r info memory]
|
||||||
|
puts [r info stats]
|
||||||
|
puts [r memory malloc-stats]
|
||||||
|
fail "defrag didn't stop."
|
||||||
|
}
|
||||||
|
|
||||||
|
# test the the fragmentation is lower
|
||||||
|
after 120 ;# serverCron only updates the info once in 100ms
|
||||||
|
set misses [s active_defrag_misses]
|
||||||
|
set hits [s active_defrag_hits]
|
||||||
|
set frag [s allocator_frag_ratio]
|
||||||
|
if {$::verbose} {
|
||||||
|
puts "frag $frag"
|
||||||
|
puts "hits: $hits"
|
||||||
|
puts "misses: $misses"
|
||||||
|
}
|
||||||
|
assert {$frag < 1.1}
|
||||||
|
assert {$misses < 10000000} ;# when defrag doesn't stop, we have some 30m misses, when it does, we have 2m misses
|
||||||
|
}
|
||||||
|
|
||||||
|
# verify the data isn't corrupted or changed
|
||||||
|
set newdigest [r debug digest]
|
||||||
|
assert {$digest eq $newdigest}
|
||||||
|
r save ;# saving an rdb iterates over all the data / pointers
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} ;# run_solo
|
} ;# run_solo
|
||||||
|
Loading…
Reference in New Issue
Block a user