linux-kernel - [RFC][PATCH] oom_kill: avoid depends on total_vm and use real RSS/swap value for oom

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20091027164526.da6a23cb.kamezawa.hiroyu@jp.fujitsu.com>
Date:	Tue, 27 Oct 2009 16:45:26 +0900
From:	KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
To:	Minchan Kim <minchan.kim@...il.com>
Cc:	KOSAKI Motohiro <kosaki.motohiro@...fujitsu.com>,
	vedran.furac@...il.com, linux-mm@...ck.org,
	linux-kernel@...r.kernel.org,
	"hugh.dickins@...cali.co.uk" <hugh.dickins@...cali.co.uk>,
	"akpm@...ux-foundation.org" <akpm@...ux-foundation.org>,
	rientjes@...gle.com
Subject: [RFC][PATCH] oom_kill: avoid depends on total_vm and use real
 RSS/swap value for oom_score (Re: Memory overcommit

On Tue, 27 Oct 2009 15:55:26 +0900
Minchan Kim <minchan.kim@...il.com> wrote:

> >> Hmm.
> >> I wonder why we consider VM size for OOM kiling.
> >> How about RSS size?
> >>
> >
> > Maybe the current code assumes "Tons of swap have been generated, already" if
> > oom-kill is invoked. Then, just using mm->anon_rss will not be correct.
> >
> > Hm, should we count # of swap entries reference from mm ?....
> 
> In Vedran case, he didn't use swap. So, Only considering vm is the problem.
> I think it would be better to consider both RSS + # of swap entries as
> Kosaki mentioned.
> 
Then, maybe this kind of patch is necessary.
This is against 2.6.31, so I may have to rebase it onto mmotm.
Added more CCs.

Vedran, I'd be glad if you could test this patch.


==
Now, oom-killer's score uses mm->total_vm as its base value.
But these days, applications such as GUI programs tend to use
many shared libraries, and total_vm grows too high even when
pages are not fully mapped.

For example, when running a program "mmap" which allocates 1 GByte of
anonymous memory, the top 10 oom_scores on the system will be:

 score  PID     name
 89924	3938	mixer_applet2
 90210	3942	tomboy
 94753	3936	clock-applet
 101994	3919	pulseaudio
 113525	4028	gnome-terminal
 127340	1	init
 128177	3871	nautilus
 151003	11515	bash
 256944	11653	mmap <-----------------use 1G of anon
 425561	3829	gnome-session

No one believes gnome-session is more guilty than "mmap".

Instead of total_vm, we should use anon/file/swap usage of a process, I think.
This patch adds mm->swap_usage and calculates oom_score based on
  anon_rss + file_rss + swap_usage.
Considering usual applications, this will be much better information than
total_vm. After this patch, the scores on my desktop are:

score   PID     name
4033	3176	gnome-panel
4077	3113	xinit
4526	3190	python
4820	3161	gnome-settings-
4989	3289	gnome-terminal
7105	3271	tomboy
8427	3177	nautilus
17549	3140	gnome-session
128501	3299	bash
256106	3383	mmap

This order is not bad, I think.

Note: This adds a new counter, so some new cost is added.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
---
 include/linux/mm_types.h |    1 +
 mm/memory.c              |   29 +++++++++++++++++++++--------
 mm/oom_kill.c            |   12 +++++++++---
 mm/rmap.c                |    1 +
 mm/swapfile.c            |    1 +
 5 files changed, 33 insertions(+), 11 deletions(-)

Index: linux-2.6.31/include/linux/mm_types.h
===================================================================
--- linux-2.6.31.orig/include/linux/mm_types.h
+++ linux-2.6.31/include/linux/mm_types.h
@@ -228,6 +228,7 @@ struct mm_struct {
 	 */
 	mm_counter_t _file_rss;
 	mm_counter_t _anon_rss;
+	mm_counter_t _swap_usage;
 
 	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
 	unsigned long hiwater_vm;	/* High-water virtual memory usage */
Index: linux-2.6.31/mm/memory.c
===================================================================
--- linux-2.6.31.orig/mm/memory.c
+++ linux-2.6.31/mm/memory.c
@@ -361,12 +361,15 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
 	return 0;
 }
 
-static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
+static inline
+void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss, int swaps)
 {
 	if (file_rss)
 		add_mm_counter(mm, file_rss, file_rss);
 	if (anon_rss)
 		add_mm_counter(mm, anon_rss, anon_rss);
+	if (swaps)
+		add_mm_counter(mm, swap_usage, swaps);
 }
 
 /*
@@ -562,6 +565,8 @@ copy_one_pte(struct mm_struct *dst_mm, s
 						 &src_mm->mmlist);
 				spin_unlock(&mmlist_lock);
 			}
+			if (!is_migration_entry(entry))
+				rss[2]++;
 			if (is_write_migration_entry(entry) &&
 					is_cow_mapping(vm_flags)) {
 				/*
@@ -611,10 +616,10 @@ static int copy_pte_range(struct mm_stru
 	pte_t *src_pte, *dst_pte;
 	spinlock_t *src_ptl, *dst_ptl;
 	int progress = 0;
-	int rss[2];
+	int rss[3];
 
 again:
-	rss[1] = rss[0] = 0;
+	rss[2] = rss[1] = rss[0] = 0;
 	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
 	if (!dst_pte)
 		return -ENOMEM;
@@ -645,7 +650,7 @@ again:
 	arch_leave_lazy_mmu_mode();
 	spin_unlock(src_ptl);
 	pte_unmap_nested(src_pte - 1);
-	add_mm_rss(dst_mm, rss[0], rss[1]);
+	add_mm_rss(dst_mm, rss[0], rss[1], rss[2]);
 	pte_unmap_unlock(dst_pte - 1, dst_ptl);
 	cond_resched();
 	if (addr != end)
@@ -769,6 +774,7 @@ static unsigned long zap_pte_range(struc
 	spinlock_t *ptl;
 	int file_rss = 0;
 	int anon_rss = 0;
+	int swaps = 0;
 
 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	arch_enter_lazy_mmu_mode();
@@ -838,13 +844,19 @@ static unsigned long zap_pte_range(struc
 		if (pte_file(ptent)) {
 			if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
 				print_bad_pte(vma, addr, ptent, NULL);
-		} else if
-		  (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
-			print_bad_pte(vma, addr, ptent, NULL);
+		} else {
+			swp_entry_t entry = pte_to_swp_entry(ptent);
+
+			if (!is_migration_entry(entry))
+				swaps++;
+
+			if (unlikely(!free_swap_and_cache(entry)))
+				print_bad_pte(vma, addr, ptent, NULL);
+		}
 		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
 
-	add_mm_rss(mm, file_rss, anon_rss);
+	add_mm_rss(mm, file_rss, anon_rss, swaps);
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
 
@@ -2573,6 +2585,7 @@ static int do_swap_page(struct mm_struct
 	 */
 
 	inc_mm_counter(mm, anon_rss);
+	dec_mm_counter(mm, swap_usage);
 	pte = mk_pte(page, vma->vm_page_prot);
 	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
Index: linux-2.6.31/mm/rmap.c
===================================================================
--- linux-2.6.31.orig/mm/rmap.c
+++ linux-2.6.31/mm/rmap.c
@@ -834,6 +834,7 @@ static int try_to_unmap_one(struct page 
 				spin_unlock(&mmlist_lock);
 			}
 			dec_mm_counter(mm, anon_rss);
+			inc_mm_counter(mm, swap_usage);
 		} else if (PAGE_MIGRATION) {
 			/*
 			 * Store the pfn of the page in a special migration
Index: linux-2.6.31/mm/swapfile.c
===================================================================
--- linux-2.6.31.orig/mm/swapfile.c
+++ linux-2.6.31/mm/swapfile.c
@@ -830,6 +830,7 @@ static int unuse_pte(struct vm_area_stru
 	}
 
 	inc_mm_counter(vma->vm_mm, anon_rss);
+	dec_mm_counter(vma->vm_mm, swap_usage);
 	get_page(page);
 	set_pte_at(vma->vm_mm, addr, pte,
 		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
Index: linux-2.6.31/mm/oom_kill.c
===================================================================
--- linux-2.6.31.orig/mm/oom_kill.c
+++ linux-2.6.31/mm/oom_kill.c
@@ -69,7 +69,8 @@ unsigned long badness(struct task_struct
 	/*
 	 * The memory size of the process is the basis for the badness.
 	 */
-	points = mm->total_vm;
+	points = get_mm_counter(mm, anon_rss) + get_mm_counter(mm, file_rss)
+		 + get_mm_counter(mm, swap_usage);
 
 	/*
 	 * After this unlock we can no longer dereference local variable `mm'
@@ -92,8 +93,13 @@ unsigned long badness(struct task_struct
 	 */
 	list_for_each_entry(child, &p->children, sibling) {
 		task_lock(child);
-		if (child->mm != mm && child->mm)
-			points += child->mm->total_vm/2 + 1;
+		if (child->mm != mm && child->mm) {
+			unsigned long cpoint;
+			/* When considering a child, we don't count swap */
+			cpoint = get_mm_counter(child->mm, anon_rss) +
+				 get_mm_counter(child->mm, file_rss);
+			points += cpoint/2 + 1;
+		}
 		task_unlock(child);
 	}
 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/