[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-Id: <20240829102039.3455842-2-hezhongkun.hzk@bytedance.com>
Date: Thu, 29 Aug 2024 18:20:39 +0800
From: Zhongkun He <hezhongkun.hzk@...edance.com>
To: akpm@...ux-foundation.org,
hannes@...xchg.org,
mhocko@...nel.org
Cc: roman.gushchin@...ux.dev,
shakeel.butt@...ux.dev,
muchun.song@...ux.dev,
lizefan.x@...edance.com,
linux-mm@...ck.org,
linux-kernel@...r.kernel.org,
cgroups@...r.kernel.org,
Zhongkun He <hezhongkun.hzk@...edance.com>
Subject: [RFC PATCH 2/2] mm: memcg: add disable_unmap_file arg to memory.reclaim
Allow proactive memory reclaimers to submit an additional
disable_unmap_file argument to memory.reclaim. This will
skip mapped file pages for that reclaim attempt.
For example:
echo "2M disable_unmap_file" > /sys/fs/cgroup/test/memory.reclaim
will perform reclaim on the test cgroup without reclaiming mapped file pages.
memory.reclaim is a useful interface: it lets us carry out proactive
memory reclaim from user space, which can increase memory
utilization. In actual usage scenarios, we found that even when
there are sufficient anonymous pages, mapped file pages, which make up a
relatively small proportion of memory, would still be reclaimed. This is likely
to cause an increase in refaults and in task delay,
because mapped file pages usually include important executable code,
data, shared libraries, etc. Our testing shows that
if we skip this part of the memory, the business delay is reduced.
Even if there are sufficient anonymous pages and a small number of
page cache and mapped file pages, mapped file pages will still be reclaimed.
Here is an example of anonymous pages being sufficient but mapped
file pages still being reclaimed:
cat memory.stat | grep -wE 'anon|file|file_mapped'
anon 3406462976
file 332967936
file_mapped 300302336
echo "1g swappiness=200" > memory.reclaim
cat memory.stat | grep -wE 'anon|file|file_mapped'
anon 2613276672
file 52523008
file_mapped 30982144
echo "1g swappiness=200" > memory.reclaim
cat memory.stat | grep -wE 'anon|file|file_mapped'
anon 1552130048
file 39759872
file_mapped 20299776
With this patch, the file_mapped pages will be skipped.
echo "1g swappiness=200 disable_unmap_file" > memory.reclaim
cat memory.stat | grep -wE 'anon|file|file_mapped'
anon 480059392
file 37978112
file_mapped 20299776
IMO, it is difficult to balance the priorities of various pages in the kernel
because there are too many scenarios to consider. However, for the scenario of proactive
memory reclaim in user space, we can make a simple judgment.
Signed-off-by: Zhongkun He <hezhongkun.hzk@...edance.com>
---
include/linux/swap.h | 1 +
mm/memcontrol.c | 9 +++++++--
mm/vmscan.c | 4 ++++
3 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index ca533b478c21..49df8f3748e8 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -409,6 +409,7 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
#define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
#define MEMCG_RECLAIM_PROACTIVE (1 << 2)
+#define MEMCG_RECLAIM_DIS_UNMAP_FILE (1 << 3)
#define MIN_SWAPPINESS 0
#define MAX_SWAPPINESS 200
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 35431035e782..7b0181553b0c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4282,11 +4282,13 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
enum {
MEMORY_RECLAIM_SWAPPINESS = 0,
+ MEMORY_RECLAIM_DISABLE_UNMAP_FILE,
MEMORY_RECLAIM_NULL,
};
static const match_table_t tokens = {
{ MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
+ { MEMORY_RECLAIM_DISABLE_UNMAP_FILE, "disable_unmap_file"},
{ MEMORY_RECLAIM_NULL, NULL },
};
@@ -4297,7 +4299,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
unsigned int nr_retries = MAX_RECLAIM_RETRIES;
unsigned long nr_to_reclaim, nr_reclaimed = 0;
int swappiness = -1;
- unsigned int reclaim_options;
+ unsigned int reclaim_options = 0;
char *old_buf, *start;
substring_t args[MAX_OPT_ARGS];
@@ -4320,12 +4322,15 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS)
return -EINVAL;
break;
+ case MEMORY_RECLAIM_DISABLE_UNMAP_FILE:
+ reclaim_options = MEMCG_RECLAIM_DIS_UNMAP_FILE;
+ break;
default:
return -EINVAL;
}
}
- reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
+ reclaim_options |= MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
while (nr_reclaimed < nr_to_reclaim) {
/* Will converge on zero, but reclaim enforces a minimum */
unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 50ac714cba2f..1b58126a8246 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6609,6 +6609,10 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
};
+
+ if (reclaim_options & MEMCG_RECLAIM_DIS_UNMAP_FILE)
+ sc.may_unmap &= ~UNMAP_FILE;
+
/*
* Traverse the ZONELIST_FALLBACK zonelist of the current node to put
* equal pressure on all the nodes. This is based on the assumption that
--
2.20.1
Powered by blists - more mailing lists