[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220512044634.63586-6-ligang.bdlg@bytedance.com>
Date: Thu, 12 May 2022 12:46:34 +0800
From: Gang Li <ligang.bdlg@...edance.com>
To: akpm@...ux-foundation.org
Cc: songmuchun@...edance.com, hca@...ux.ibm.com, gor@...ux.ibm.com,
agordeev@...ux.ibm.com, borntraeger@...ux.ibm.com,
svens@...ux.ibm.com, ebiederm@...ssion.com, keescook@...omium.org,
viro@...iv.linux.org.uk, rostedt@...dmis.org, mingo@...hat.com,
peterz@...radead.org, acme@...nel.org, mark.rutland@....com,
alexander.shishkin@...ux.intel.com, jolsa@...nel.org,
namhyung@...nel.org, david@...hat.com, imbrenda@...ux.ibm.com,
apopple@...dia.com, adobriyan@...il.com,
stephen.s.brennan@...cle.com, ohoono.kwon@...sung.com,
haolee.swjtu@...il.com, kaleshsingh@...gle.com,
zhengqi.arch@...edance.com, peterx@...hat.com, shy828301@...il.com,
surenb@...gle.com, ccross@...gle.com, vincent.whitchurch@...s.com,
tglx@...utronix.de, bigeasy@...utronix.de, fenghua.yu@...el.com,
linux-s390@...r.kernel.org, linux-kernel@...r.kernel.org,
linux-mm@...ck.org, linux-fsdevel@...r.kernel.org,
linux-perf-users@...r.kernel.org,
Gang Li <ligang.bdlg@...edance.com>
Subject: [PATCH 5/5 v1] mm, oom: enable per numa node oom for CONSTRAINT_MEMORY_POLICY
The page allocator will only allocate pages on the node indicated by
`nodemask`, but oom will still select the bad process by total rss usage,
which may reclaim nothing on the node indicated by `nodemask`.
This patch lets oom calculate rss only on the given node when
oc->constraint equals CONSTRAINT_MEMORY_POLICY.
If `nodemask` is assigned, the process with the highest memory
consumption on the specific node will be killed. The oom_kill dmesg
output will look like this:
```
[ 1471.436027] Tasks state (memory values in pages):
[ 1471.438518] [ pid ] uid tgid total_vm rss (01)nrss pgtables_bytes swapents oom_score_adj name
[ 1471.554703] [ 1011] 0 1011 220005 8589 1872 823296 0 0 node
[ 1471.707912] [ 12399] 0 12399 1311306 1311056 262170 10534912 0 0 a.out
[ 1471.712429] [ 13135] 0 13135 787018 674666 674300 5439488 0 0 a.out
[ 1471.721506] [ 13295] 0 13295 597 188 0 24576 0 0 sh
[ 1471.734600] oom-kill:constraint=CONSTRAINT_MEMORY_POLICY,nodemask=1,cpuset=/,mems_allowed=0-2,global_oom,task_memcg=/user.slice/user-0.slice/session-3.scope,task=a.out,pid=13135,uid=0
[ 1471.742583] Out of memory: Killed process 13135 (a.out) total-vm:3148072kB, anon-rss:2697304kB, file-rss:1360kB, shmem-rss:0kB, UID:0 pgtables:5312kB oom_score_adj:0
[ 1471.849615] oom_reaper: reaped process 13135 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
```
Signed-off-by: Gang Li <ligang.bdlg@...edance.com>
---
fs/proc/base.c | 6 +++++-
include/linux/oom.h | 2 +-
mm/oom_kill.c | 45 +++++++++++++++++++++++++++++++++++++--------
3 files changed, 43 insertions(+), 10 deletions(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c1031843cc6a..caf0f51284d0 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -552,8 +552,12 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
unsigned long totalpages = totalram_pages() + total_swap_pages;
unsigned long points = 0;
long badness;
+ struct oom_control oc = {
+ .totalpages = totalpages,
+ .gfp_mask = 0,
+ };
- badness = oom_badness(task, totalpages);
+ badness = oom_badness(task, &oc);
/*
* Special case OOM_SCORE_ADJ_MIN for all others scale the
* badness value into [0, 2000] range which we have been
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 2db9a1432511..0cb6a60be776 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -109,7 +109,7 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
bool __oom_reap_task_mm(struct mm_struct *mm);
long oom_badness(struct task_struct *p,
- unsigned long totalpages);
+ struct oom_control *oc);
extern bool out_of_memory(struct oom_control *oc);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 757f5665ae94..75a80b5a63bf 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -198,7 +198,7 @@ static bool should_dump_unreclaim_slab(void)
* predictable as possible. The goal is to return the highest value for the
* task consuming the most memory to avoid subsequent oom failures.
*/
-long oom_badness(struct task_struct *p, unsigned long totalpages)
+long oom_badness(struct task_struct *p, struct oom_control *oc)
{
long points;
long adj;
@@ -227,12 +227,22 @@ long oom_badness(struct task_struct *p, unsigned long totalpages)
* The baseline for the badness score is the proportion of RAM that each
* task's rss, pagetable and swap space use.
*/
- points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS, NUMA_NO_NODE) +
- mm_pgtables_bytes(p->mm) / PAGE_SIZE;
+ if (unlikely(oc->constraint == CONSTRAINT_MEMORY_POLICY)) {
+ struct zoneref *zoneref = first_zones_zonelist(oc->zonelist, gfp_zone(oc->gfp_mask),
+ oc->nodemask);
+ int nid_to_find_victim = zone_to_nid(zoneref->zone);
+
+ points = get_mm_counter(p->mm, -1, nid_to_find_victim) +
+ get_mm_counter(p->mm, MM_SWAPENTS, NUMA_NO_NODE) +
+ mm_pgtables_bytes(p->mm) / PAGE_SIZE;
+ } else {
+ points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS, NUMA_NO_NODE) +
+ mm_pgtables_bytes(p->mm) / PAGE_SIZE;
+ }
task_unlock(p);
/* Normalize to oom_score_adj units */
- adj *= totalpages / 1000;
+ adj *= oc->totalpages / 1000;
points += adj;
return points;
@@ -338,7 +348,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
goto select;
}
- points = oom_badness(task, oc->totalpages);
+ points = oom_badness(task, oc);
if (points == LONG_MIN || points < oc->chosen_points)
goto next;
@@ -382,6 +392,7 @@ static int dump_task(struct task_struct *p, void *arg)
{
struct oom_control *oc = arg;
struct task_struct *task;
+ unsigned long node_mm_rss;
if (oom_unkillable_task(p))
return 0;
@@ -399,9 +410,18 @@ static int dump_task(struct task_struct *p, void *arg)
return 0;
}
- pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
+ if (unlikely(oc->constraint == CONSTRAINT_MEMORY_POLICY)) {
+ struct zoneref *zoneref = first_zones_zonelist(oc->zonelist, gfp_zone(oc->gfp_mask),
+ oc->nodemask);
+ int nid_to_find_victim = zone_to_nid(zoneref->zone);
+
+ node_mm_rss = get_mm_counter(p->mm, -1, nid_to_find_victim);
+ } else {
+ node_mm_rss = 0;
+ }
+ pr_info("[%7d] %5d %5d %8lu %8lu %8lu %8ld %8lu %5hd %s\n",
task->pid, from_kuid(&init_user_ns, task_uid(task)),
- task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
+ task->tgid, task->mm->total_vm, get_mm_rss(task->mm), node_mm_rss,
mm_pgtables_bytes(task->mm),
get_mm_counter(task->mm, MM_SWAPENTS, NUMA_NO_NODE),
task->signal->oom_score_adj, task->comm);
@@ -422,8 +442,17 @@ static int dump_task(struct task_struct *p, void *arg)
*/
static void dump_tasks(struct oom_control *oc)
{
+ int nid_to_find_victim;
+
+ if (oc->nodemask) {
+ struct zoneref *zoneref = first_zones_zonelist(oc->zonelist, gfp_zone(oc->gfp_mask),
+ oc->nodemask);
+ nid_to_find_victim = zone_to_nid(zoneref->zone);
+ } else {
+ nid_to_find_victim = -1;
+ }
pr_info("Tasks state (memory values in pages):\n");
- pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
+ pr_info("[ pid ] uid tgid total_vm rss (%02d)nrss pgtables_bytes swapents oom_score_adj name\n", nid_to_find_victim);
if (is_memcg_oom(oc))
mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
--
2.20.1
Powered by blists - more mailing lists