[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <87bk405akl.fsf@yhuang6-desk2.ccr.corp.intel.com>
Date: Mon, 17 Jun 2024 14:48:26 +0800
From: "Huang, Ying" <ying.huang@...el.com>
To: Barry Song <21cnbao@...il.com>
Cc: akpm@...ux-foundation.org, chrisl@...nel.org, baohua@...nel.org,
kaleshsingh@...gle.com, kasong@...cent.com,
linux-kernel@...r.kernel.org, linux-mm@...ck.org, ryan.roberts@....com
Subject: Re: [PATCH v2 0/2] mm: swap: mTHP swap allocator base on swap
cluster order
Hi, Barry,
Barry Song <21cnbao@...il.com> writes:
> On Sat, Jun 15, 2024 at 2:59 PM Andrew Morton <akpm@...ux-foundation.org> wrote:
>>
>> On Fri, 14 Jun 2024 19:51:11 -0700 Chris Li <chrisl@...nel.org> wrote:
>>
>> > > I'm having trouble understanding the overall impact of this on users.
>> > > We fail the mTHP swap allocation and fall back, but things continue to
>> > > operate OK?
>> >
>> > Continue to operate OK in the sense that the mTHP will have to split
>> > into 4K pages before the swap out, aka the fall back. The swap out and
>> > swap in can continue to work as 4K pages, not as the mTHP. Due to the
>> > fallback, the mTHP based zsmalloc compression with 64K buffer will not
>> > happen. That is the effect of the fallback. But mTHP swap out and swap
>> > in is relatively new, it is not really a regression.
>>
>> Sure, but it's pretty bad to merge a new feature only to have it
>> ineffective after a few hours use.
>>
>> > >
>> > > > There is some test number in the V1 thread of this series:
>> > > > https://lore.kernel.org/r/20240524-swap-allocator-v1-0-47861b423b26@kernel.org
>> > >
>> > > Well, please let's get the latest numbers into the latest patchset.
>> > > Along with a higher-level (and quantitative) description of the user impact.
>> >
>> > I will need Barry's help to collect the number. I don't have the
>> > setup to reproduce his test result.
>> > Maybe a follow up commit message amendment for the test number when I get it?
>
> Although the issue may seem complex at a systemic level, even a small program can
> demonstrate the problem and highlight how Chris's patch has improved the
> situation.
>
> To demonstrate this, I designed a basic test program that maximally allocates
> two memory blocks:
>
> * A memory block of up to 60MB, recommended for HUGEPAGE usage
> * A memory block of up to 1MB, recommended for NOHUGEPAGE usage
>
> In the system configuration, I enabled 64KB mTHP and 64MB zRAM, providing more than
> enough space for both the 60MB and 1MB allocations in the worst case. This setup
> allows us to assess two effects:
>
> 1. When we don't enable mem2 (small folios), we consistently allocate and free
> swap slots aligned with 64KB, to check whether there is a risk of failing to
> obtain swap slots even though the zRAM has sufficient free space.
> 2. When we enable mem2 (small folios), the presence of small folios may lead
> to fragmentation of clusters, potentially impacting the swapout process for
> large folios negatively.
>
> (2) can be enabled by "-s", without -s, small folios are disabled.
>
> The script to configure zRAM and mTHP:
>
> echo lzo > /sys/block/zram0/comp_algorithm
> echo 64M > /sys/block/zram0/disksize
> echo never > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
> echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
> mkswap /dev/zram0
> swapon /dev/zram0
>
> The test program I made today after receiving Chris' patchset v2
>
> (Andrew, Please let me know if you want this small test program to
> be committed into kernel/tools/ folder. If yes, please let me know,
> and I will cleanup and prepare a patch):
>
> #define _GNU_SOURCE
> #include <stdio.h>
> #include <stdlib.h>
> #include <unistd.h>
> #include <string.h>
> #include <sys/mman.h>
> #include <errno.h>
> #include <time.h>
>
> #define MEMSIZE_MTHP (60 * 1024 * 1024)
> #define MEMSIZE_SMALLFOLIO (1 * 1024 * 1024)
> #define ALIGNMENT_MTHP (64 * 1024)
> #define ALIGNMENT_SMALLFOLIO (4 * 1024)
> #define TOTAL_DONTNEED_MTHP (16 * 1024 * 1024)
> #define TOTAL_DONTNEED_SMALLFOLIO (256 * 1024)
> #define MTHP_FOLIO_SIZE (64 * 1024)
>
> #define SWPOUT_PATH \
> "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout"
> #define SWPOUT_FALLBACK_PATH \
> "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback"
>
> /* Allocate a size-byte block whose start address is aligned to
>  * `alignment` bytes (needed so madvise ranges line up with mTHP
>  * boundaries).  On failure prints the posix_memalign error and
>  * returns NULL; on success the caller owns the memory and must
>  * free() it. */
> static void *aligned_alloc_mem(size_t size, size_t alignment)
> {
> void *mem = NULL;
> if (posix_memalign(&mem, alignment, size) != 0) {
> perror("posix_memalign");
> return NULL;
> }
> return mem;
> }
>
> /* Punch `total_dontneed_size` bytes of MADV_DONTNEED holes into
>  * `mem`, one `align_size`-sized hole at a random aligned offset
>  * per iteration, immediately re-touching each hole with memset so
>  * the range is freshly populated again for a later MADV_PAGEOUT.
>  * madvise failures are reported but non-fatal (best-effort churn).
>  * NOTE(review): rand() is never seeded, so the hole pattern is
>  * identical on every run — presumably intentional so results are
>  * reproducible across kernels; confirm with the author. */
> static void random_madvise_dontneed(void *mem, size_t mem_size,
> size_t align_size, size_t total_dontneed_size)
> {
> size_t num_pages = total_dontneed_size / align_size;
> size_t i;
> size_t offset;
> void *addr;
>
> for (i = 0; i < num_pages; ++i) {
> /* Offset is always a multiple of align_size and at most
>  * mem_size - align_size, so the range stays inside mem. */
> offset = (rand() % (mem_size / align_size)) * align_size;
> addr = (char *)mem + offset;
> if (madvise(addr, align_size, MADV_DONTNEED) != 0) {
> perror("madvise dontneed");
> }
> memset(addr, 0x11, align_size);
> }
> }
>
> /* Read one unsigned long counter from the sysfs file at `path`
>  * (here: the hugepages-64kB swpout / swpout_fallback counters).
>  * Returns 0 after printing an error if the file cannot be opened
>  * or parsed — callers treat that as "counter unavailable". */
> static unsigned long read_stat(const char *path)
> {
> FILE *file;
> unsigned long value;
>
> file = fopen(path, "r");
> if (!file) {
> perror("fopen");
> return 0;
> }
>
> if (fscanf(file, "%lu", &value) != 1) {
> perror("fscanf");
> fclose(file);
> return 0;
> }
>
> fclose(file);
> return value;
> }
>
> /*
>  * Driver: repeatedly churn and page out a 60MB MADV_HUGEPAGE
>  * region (and, with "-s", a 1MB MADV_NOHUGEPAGE region whose 4KB
>  * folios fragment the swap clusters), reading the hugepages-64kB
>  * swpout/swpout_fallback counters around each MADV_PAGEOUT and
>  * printing the per-iteration fallback percentage.
>  *
>  * Returns EXIT_SUCCESS after 100 iterations, EXIT_FAILURE on any
>  * allocation or madvise error.
>  */
> int main(int argc, char *argv[])
> {
> int use_small_folio = 0;
> int i;
> void *mem1 = aligned_alloc_mem(MEMSIZE_MTHP, ALIGNMENT_MTHP);
> if (mem1 == NULL) {
> fprintf(stderr, "Failed to allocate 60MB memory\n");
> return EXIT_FAILURE;
> }
>
> /* Ask the kernel to back mem1 with (m)THP large folios. */
> if (madvise(mem1, MEMSIZE_MTHP, MADV_HUGEPAGE) != 0) {
> perror("madvise hugepage for mem1");
> free(mem1);
> return EXIT_FAILURE;
> }
>
> for (i = 1; i < argc; ++i) {
> if (strcmp(argv[i], "-s") == 0) {
> use_small_folio = 1;
> }
> }
>
> void *mem2 = NULL;
> if (use_small_folio) {
> mem2 = aligned_alloc_mem(MEMSIZE_SMALLFOLIO, ALIGNMENT_MTHP);
> if (mem2 == NULL) {
> fprintf(stderr, "Failed to allocate 1MB memory\n");
> free(mem1);
> return EXIT_FAILURE;
> }
>
> /* Keep mem2 in small (4KB) folios so its swap slots can
>  * fragment the 64KB swap clusters. */
> if (madvise(mem2, MEMSIZE_SMALLFOLIO, MADV_NOHUGEPAGE) != 0) {
> perror("madvise nohugepage for mem2");
> free(mem1);
> free(mem2);
> return EXIT_FAILURE;
> }
> }
>
> for (i = 0; i < 100; ++i) {
> unsigned long initial_swpout;
> unsigned long initial_swpout_fallback;
> unsigned long final_swpout;
> unsigned long final_swpout_fallback;
> unsigned long swpout_inc;
> unsigned long swpout_fallback_inc;
> double fallback_percentage;
>
> initial_swpout = read_stat(SWPOUT_PATH);
> initial_swpout_fallback = read_stat(SWPOUT_FALLBACK_PATH);
>
> /* Churn swap slots: free random 64KB chunks, then re-touch. */
> random_madvise_dontneed(mem1, MEMSIZE_MTHP, ALIGNMENT_MTHP,
> TOTAL_DONTNEED_MTHP);
>
> if (use_small_folio) {
> random_madvise_dontneed(mem2, MEMSIZE_SMALLFOLIO,
> ALIGNMENT_SMALLFOLIO,
> TOTAL_DONTNEED_SMALLFOLIO);
> }
>
> if (madvise(mem1, MEMSIZE_MTHP, MADV_PAGEOUT) != 0) {
> perror("madvise pageout for mem1");
> free(mem1);
> if (mem2 != NULL) {
> free(mem2);
> }
> return EXIT_FAILURE;
> }
>
> if (use_small_folio) {
> if (madvise(mem2, MEMSIZE_SMALLFOLIO, MADV_PAGEOUT) != 0) {
> perror("madvise pageout for mem2");
> free(mem1);
> free(mem2);
> return EXIT_FAILURE;
> }
> }
>
> final_swpout = read_stat(SWPOUT_PATH);
> final_swpout_fallback = read_stat(SWPOUT_FALLBACK_PATH);
>
> swpout_inc = final_swpout - initial_swpout;
> swpout_fallback_inc = final_swpout_fallback - initial_swpout_fallback;
>
> /* Guard against 0/0 -> NaN when neither counter moved this
>  * iteration (e.g. stats files unreadable, or nothing was
>  * actually reclaimed); report 0% fallback in that case. */
> if (swpout_inc + swpout_fallback_inc != 0)
> fallback_percentage = (double)swpout_fallback_inc /
> (swpout_fallback_inc + swpout_inc) * 100;
> else
> fallback_percentage = 0.0;
>
> printf("Iteration %d: swpout inc: %lu, swpout fallback inc: %lu, Fallback percentage: %.2f%%\n",
> i + 1, swpout_inc, swpout_fallback_inc, fallback_percentage);
> }
>
> free(mem1);
> if (mem2 != NULL) {
> free(mem2);
> }
>
> return EXIT_SUCCESS;
> }
Thank you very much for your effort to write this test program.
TBH, personally, I thought that this test program isn't practical
enough. Can we show performance difference with some normal workloads?
[snip]
--
Best Regards,
Huang, Ying
Powered by blists - more mailing lists