Defragger improvements around large bins (#12996)

Implement #12963

## Changes
1. Large bins don't have external fragmentation, or are at least
non-defraggable, so we should ignore the effect of
large bins when measuring fragmentation and only measure the fragmentation
of small bins. This affects both the allocator_frag* metrics and
the active-defrag trigger.
2. Add INFO metrics for `muzzy` memory, i.e. memory that was returned to
the OS but still shows up as RSS until the OS reclaims it.

---------

Co-authored-by: Oran Agra <oran@redislabs.com>
This commit is contained in:
debing.sun 2024-02-21 00:11:09 +08:00 committed by GitHub
parent ca5cac998e
commit f6785df663
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 99 additions and 18 deletions

View File

@ -780,17 +780,21 @@ void activeDefragKvstore(kvstore *kvs) {
* or not, a false detection can cause the defragmenter to waste a lot of CPU * or not, a false detection can cause the defragmenter to waste a lot of CPU
* without the possibility of getting any results. */ * without the possibility of getting any results. */
float getAllocatorFragmentation(size_t *out_frag_bytes) { float getAllocatorFragmentation(size_t *out_frag_bytes) {
size_t resident, active, allocated; size_t resident, active, allocated, frag_smallbins_bytes;
zmalloc_get_allocator_info(&allocated, &active, &resident); zmalloc_get_allocator_info(&allocated, &active, &resident, NULL, NULL, &frag_smallbins_bytes);
float frag_pct = ((float)active / allocated)*100 - 100;
size_t frag_bytes = active - allocated; /* Calculate the fragmentation ratio as the proportion of wasted memory in small
* bins (which are defraggable) relative to the total allocated memory (including large bins).
* This is because otherwise, if most of the memory usage is large bins, we may show high percentage,
* despite the fact it's not a lot of memory for the user. */
float frag_pct = (float)frag_smallbins_bytes / allocated * 100;
float rss_pct = ((float)resident / allocated)*100 - 100; float rss_pct = ((float)resident / allocated)*100 - 100;
size_t rss_bytes = resident - allocated; size_t rss_bytes = resident - allocated;
if(out_frag_bytes) if(out_frag_bytes)
*out_frag_bytes = frag_bytes; *out_frag_bytes = frag_smallbins_bytes;
serverLog(LL_DEBUG, serverLog(LL_DEBUG,
"allocated=%zu, active=%zu, resident=%zu, frag=%.2f%% (%.2f%% rss), frag_bytes=%zu (%zu rss)", "allocated=%zu, active=%zu, resident=%zu, frag=%.2f%% (%.2f%% rss), frag_bytes=%zu (%zu rss)",
allocated, active, resident, frag_pct, rss_pct, frag_bytes, rss_bytes); allocated, active, resident, frag_pct, rss_pct, frag_smallbins_bytes, rss_bytes);
return frag_pct; return frag_pct;
} }

View File

@ -1184,9 +1184,9 @@ struct redisMemOverhead *getMemoryOverheadData(void) {
mh->total_frag_bytes = mh->total_frag_bytes =
server.cron_malloc_stats.process_rss - server.cron_malloc_stats.zmalloc_used; server.cron_malloc_stats.process_rss - server.cron_malloc_stats.zmalloc_used;
mh->allocator_frag = mh->allocator_frag =
(float)server.cron_malloc_stats.allocator_active / server.cron_malloc_stats.allocator_allocated; (float)server.cron_malloc_stats.allocator_frag_smallbins_bytes / server.cron_malloc_stats.allocator_allocated + 1;
mh->allocator_frag_bytes = mh->allocator_frag_bytes =
server.cron_malloc_stats.allocator_active - server.cron_malloc_stats.allocator_allocated; server.cron_malloc_stats.allocator_frag_smallbins_bytes;
mh->allocator_rss = mh->allocator_rss =
(float)server.cron_malloc_stats.allocator_resident / server.cron_malloc_stats.allocator_active; (float)server.cron_malloc_stats.allocator_resident / server.cron_malloc_stats.allocator_active;
mh->allocator_rss_bytes = mh->allocator_rss_bytes =
@ -1556,7 +1556,7 @@ NULL
} else if (!strcasecmp(c->argv[1]->ptr,"stats") && c->argc == 2) { } else if (!strcasecmp(c->argv[1]->ptr,"stats") && c->argc == 2) {
struct redisMemOverhead *mh = getMemoryOverheadData(); struct redisMemOverhead *mh = getMemoryOverheadData();
addReplyMapLen(c,27+mh->num_dbs); addReplyMapLen(c,28+mh->num_dbs);
addReplyBulkCString(c,"peak.allocated"); addReplyBulkCString(c,"peak.allocated");
addReplyLongLong(c,mh->peak_allocated); addReplyLongLong(c,mh->peak_allocated);
@ -1628,6 +1628,9 @@ NULL
addReplyBulkCString(c,"allocator.resident"); addReplyBulkCString(c,"allocator.resident");
addReplyLongLong(c,server.cron_malloc_stats.allocator_resident); addReplyLongLong(c,server.cron_malloc_stats.allocator_resident);
addReplyBulkCString(c,"allocator.muzzy");
addReplyLongLong(c,server.cron_malloc_stats.allocator_muzzy);
addReplyBulkCString(c,"allocator-fragmentation.ratio"); addReplyBulkCString(c,"allocator-fragmentation.ratio");
addReplyDouble(c,mh->allocator_frag); addReplyDouble(c,mh->allocator_frag);

View File

@ -1228,7 +1228,10 @@ void cronUpdateMemoryStats(void) {
* allocations, and allocator reserved pages that can be purged (all not actual frag) */ * allocations, and allocator reserved pages that can be purged (all not actual frag) */
zmalloc_get_allocator_info(&server.cron_malloc_stats.allocator_allocated, zmalloc_get_allocator_info(&server.cron_malloc_stats.allocator_allocated,
&server.cron_malloc_stats.allocator_active, &server.cron_malloc_stats.allocator_active,
&server.cron_malloc_stats.allocator_resident); &server.cron_malloc_stats.allocator_resident,
NULL,
&server.cron_malloc_stats.allocator_muzzy,
&server.cron_malloc_stats.allocator_frag_smallbins_bytes);
/* in case the allocator isn't providing these stats, fake them so that /* in case the allocator isn't providing these stats, fake them so that
* fragmentation info still shows some (inaccurate metrics) */ * fragmentation info still shows some (inaccurate metrics) */
if (!server.cron_malloc_stats.allocator_resident) { if (!server.cron_malloc_stats.allocator_resident) {
@ -5643,6 +5646,7 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) {
"allocator_allocated:%zu\r\n", server.cron_malloc_stats.allocator_allocated, "allocator_allocated:%zu\r\n", server.cron_malloc_stats.allocator_allocated,
"allocator_active:%zu\r\n", server.cron_malloc_stats.allocator_active, "allocator_active:%zu\r\n", server.cron_malloc_stats.allocator_active,
"allocator_resident:%zu\r\n", server.cron_malloc_stats.allocator_resident, "allocator_resident:%zu\r\n", server.cron_malloc_stats.allocator_resident,
"allocator_muzzy:%zu\r\n", server.cron_malloc_stats.allocator_muzzy,
"total_system_memory:%lu\r\n", (unsigned long)total_system_mem, "total_system_memory:%lu\r\n", (unsigned long)total_system_mem,
"total_system_memory_human:%s\r\n", total_system_hmem, "total_system_memory_human:%s\r\n", total_system_hmem,
"used_memory_lua:%lld\r\n", memory_lua, /* deprecated, renamed to used_memory_vm_eval */ "used_memory_lua:%lld\r\n", memory_lua, /* deprecated, renamed to used_memory_vm_eval */

View File

@ -1464,6 +1464,8 @@ struct malloc_stats {
size_t allocator_allocated; size_t allocator_allocated;
size_t allocator_active; size_t allocator_active;
size_t allocator_resident; size_t allocator_resident;
size_t allocator_muzzy;
size_t allocator_frag_smallbins_bytes;
}; };
/*----------------------------------------------------------------------------- /*-----------------------------------------------------------------------------

View File

@ -626,9 +626,54 @@ size_t zmalloc_get_rss(void) {
#if defined(USE_JEMALLOC) #if defined(USE_JEMALLOC)
int zmalloc_get_allocator_info(size_t *allocated, #include <assert.h>
size_t *active,
size_t *resident) { #define STRINGIFY_(x) #x
#define STRINGIFY(x) STRINGIFY_(x)
/* Compute the total memory wasted by fragmentation inside small arena bins.
 * Done by summing the memory in unused regs in all slabs of all small bins.
 *
 * For each bin index below "arenas.nbins" the following mallctl values are
 * read:
 *   arenas.bin.<j>.size                     - size of a single region (size_t)
 *   arenas.bin.<j>.nregs                    - regions per slab (uint32_t)
 *   stats.arenas.<ALL>.bins.<j>.curregs     - regions currently in use (size_t)
 *   stats.arenas.<ALL>.bins.<j>.curslabs    - slabs currently held (size_t)
 * where <ALL> is the stringified MALLCTL_ARENAS_ALL.
 *
 * Returns the number of bytes occupied by unused regions, summed over all
 * bins. This function does not refresh jemalloc's stats epoch itself; it
 * reports whatever snapshot is current when it is called.
 *
 * The mallctl calls are deliberately kept OUT of the assert() expressions:
 * when NDEBUG is defined assert() expands to nothing, which would remove the
 * calls entirely and leave the locals uninitialized. The locals are also
 * zero-initialized so that a failed read in such a build contributes 0
 * instead of garbage. */
size_t zmalloc_get_frag_smallbins(void) {
    unsigned nbins = 0;
    size_t sz, frag = 0;
    char buf[100];
    int rc;

    sz = sizeof(unsigned);
    rc = je_mallctl("arenas.nbins", &nbins, &sz, NULL, 0);
    assert(rc == 0);

    for (unsigned j = 0; j < nbins; j++) {
        size_t curregs = 0, curslabs = 0, reg_size = 0;
        uint32_t nregs = 0;

        /* The size of the current bin. `j` is unsigned, hence %u. */
        snprintf(buf, sizeof(buf), "arenas.bin.%u.size", j);
        sz = sizeof(size_t);
        rc = je_mallctl(buf, &reg_size, &sz, NULL, 0);
        assert(rc == 0);

        /* Number of used regions in the bin */
        snprintf(buf, sizeof(buf), "stats.arenas." STRINGIFY(MALLCTL_ARENAS_ALL) ".bins.%u.curregs", j);
        sz = sizeof(size_t);
        rc = je_mallctl(buf, &curregs, &sz, NULL, 0);
        assert(rc == 0);

        /* Number of regions per slab */
        snprintf(buf, sizeof(buf), "arenas.bin.%u.nregs", j);
        sz = sizeof(uint32_t);
        rc = je_mallctl(buf, &nregs, &sz, NULL, 0);
        assert(rc == 0);

        /* Number of current slabs in the bin */
        snprintf(buf, sizeof(buf), "stats.arenas." STRINGIFY(MALLCTL_ARENAS_ALL) ".bins.%u.curslabs", j);
        sz = sizeof(size_t);
        rc = je_mallctl(buf, &curslabs, &sz, NULL, 0);
        assert(rc == 0);

        /* Calculate the fragmentation bytes for the current bin and add it to the total:
         * (total regions in all slabs - regions in use) * region size. */
        frag += ((nregs * curslabs) - curregs) * reg_size;
    }

    /* Keeps `rc` "used" when assert() is compiled out. */
    (void)rc;
    return frag;
}
int zmalloc_get_allocator_info(size_t *allocated, size_t *active, size_t *resident,
size_t *retained, size_t *muzzy, size_t *frag_smallbins_bytes)
{
uint64_t epoch = 1; uint64_t epoch = 1;
size_t sz; size_t sz;
*allocated = *resident = *active = 0; *allocated = *resident = *active = 0;
@ -645,6 +690,26 @@ int zmalloc_get_allocator_info(size_t *allocated,
/* Unlike zmalloc_used_memory, this matches the stats.resident by taking /* Unlike zmalloc_used_memory, this matches the stats.resident by taking
* into account all allocations done by this process (not only zmalloc). */ * into account all allocations done by this process (not only zmalloc). */
je_mallctl("stats.allocated", allocated, &sz, NULL, 0); je_mallctl("stats.allocated", allocated, &sz, NULL, 0);
/* Retained memory is memory released by `madvise(..., MADV_DONTNEED)`, which is not part
* of RSS or mapped memory, and doesn't have a strong association with physical memory in the OS.
* It is still part of the VM-Size, and may be used again in later allocations. */
if (retained) {
*retained = 0;
je_mallctl("stats.retained", retained, &sz, NULL, 0);
}
/* Unlike retained, muzzy represents memory released with `madvise(..., MADV_FREE)`.
* These pages will show as RSS for the process, until the OS decides to re-use them. */
if (muzzy) {
size_t pmuzzy, page;
assert(!je_mallctl("stats.arenas." STRINGIFY(MALLCTL_ARENAS_ALL) ".pmuzzy", &pmuzzy, &sz, NULL, 0));
assert(!je_mallctl("arenas.page", &page, &sz, NULL, 0));
*muzzy = pmuzzy * page;
}
/* Total size of consumed memory in unused regs in small bins (AKA external fragmentation). */
*frag_smallbins_bytes = zmalloc_get_frag_smallbins();
return 1; return 1;
} }
@ -670,10 +735,12 @@ int jemalloc_purge(void) {
#else #else
int zmalloc_get_allocator_info(size_t *allocated, int zmalloc_get_allocator_info(size_t *allocated, size_t *active, size_t *resident,
size_t *active, size_t *retained, size_t *muzzy, size_t *frag_smallbins_bytes)
size_t *resident) { {
*allocated = *resident = *active = 0; *allocated = *resident = *active = *frag_smallbins_bytes = 0;
if (retained) *retained = 0;
if (muzzy) *muzzy = 0;
return 1; return 1;
} }

View File

@ -122,7 +122,8 @@ __attribute__((malloc)) char *zstrdup(const char *s);
size_t zmalloc_used_memory(void); size_t zmalloc_used_memory(void);
void zmalloc_set_oom_handler(void (*oom_handler)(size_t)); void zmalloc_set_oom_handler(void (*oom_handler)(size_t));
size_t zmalloc_get_rss(void); size_t zmalloc_get_rss(void);
int zmalloc_get_allocator_info(size_t *allocated, size_t *active, size_t *resident); int zmalloc_get_allocator_info(size_t *allocated, size_t *active, size_t *resident,
size_t *retained, size_t *muzzy, size_t *frag_smallbins_bytes);
void set_jemalloc_bg_thread(int enable); void set_jemalloc_bg_thread(int enable);
int jemalloc_purge(void); int jemalloc_purge(void);
size_t zmalloc_get_private_dirty(long pid); size_t zmalloc_get_private_dirty(long pid);