Message-Id: <20190727171047.31610-1-longman@redhat.com>
Date:   Sat, 27 Jul 2019 13:10:47 -0400
From:   Waiman Long <longman@...hat.com>
To:     Peter Zijlstra <peterz@...radead.org>,
        Ingo Molnar <mingo@...hat.com>
Cc:     linux-kernel@...r.kernel.org, linux-mm@...ck.org,
        Andrew Morton <akpm@...ux-foundation.org>,
        Phil Auld <pauld@...hat.com>, Waiman Long <longman@...hat.com>
Subject: [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads

It was found that a dying mm_struct, one whose owning task has exited,
can stay on as the active_mm of kernel threads as long as no other user
task runs on the CPUs that are using it as active_mm. This prolongs the
lifetime of the dying mm, holding up memory and other resources such as
swap space that could otherwise be freed.
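
For context, the lazy-TLB borrowing path in context_switch() that keeps
the dying mm pinned looks roughly like this (a simplified paraphrase of
the pre-patch code in kernel/sched/core.c, not the exact code):

	/*
	 * next is a kernel thread (mm == NULL): borrow the previous
	 * task's mm as active_mm and take a reference on it, so the mm
	 * cannot be freed until another user task runs on this CPU.
	 */
	if (!mm) {
		next->active_mm = oldmm;
		mmgrab(oldmm);		/* pins oldmm, even if it is dying */
		enter_lazy_tlb(oldmm, next);
	} else
		switch_mm_irqs_off(oldmm, mm, next);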

Fix this by forcing kernel threads to use init_mm as their active_mm
if the previous active_mm is dying.

A dying mm is detected by the absence of an owning task. An owning task
is only assigned when the CONFIG_MEMCG option is enabled; without it,
there is no simple way to determine the lifespan of a given mm, so the
code falls back to the old behavior.
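
Pulling the three hunks of the diff below together, the resulting logic
is roughly the following (a condensed sketch, not a drop-in replacement
for the diff):

	/* include/linux/mm_types.h (CONFIG_MEMCG): no owning task => dying */
	static inline bool mm_dying(struct mm_struct *mm)
	{
		return !mm->owner;
	}
	/* without CONFIG_MEMCG, mm_dying() simply returns false */

	/* kernel/sched/core.c: kthreads stop borrowing a dying mm */
	if (!mm && !mm_dying(oldmm)) {
		/* old lazy-TLB borrowing, unchanged */
	} else {
		if (!mm) {
			mm = &init_mm;	/* init_mm never dies; init_task owns it */
			next->active_mm = mm;
			mmgrab(mm);
		}
		switch_mm_irqs_off(oldmm, mm, next);
	}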

Signed-off-by: Waiman Long <longman@...hat.com>
---
 include/linux/mm_types.h | 15 +++++++++++++++
 kernel/sched/core.c      | 13 +++++++++++--
 mm/init-mm.c             |  4 ++++
 3 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3a37a89eb7a7..32712e78763c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -623,6 +623,21 @@ static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
 	return atomic_read(&mm->tlb_flush_pending) > 1;
 }
 
+#ifdef CONFIG_MEMCG
+/*
+ * A mm is considered dying if there is no owning task.
+ */
+static inline bool mm_dying(struct mm_struct *mm)
+{
+	return !mm->owner;
+}
+#else
+static inline bool mm_dying(struct mm_struct *mm)
+{
+	return false;
+}
+#endif
+
 struct vm_fault;
 
 /**
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2b037f195473..923a63262dfd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3233,13 +3233,22 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 * Both of these contain the full memory barrier required by
 	 * membarrier after storing to rq->curr, before returning to
 	 * user-space.
+	 *
+	 * If mm is NULL and oldmm is dying (!owner), we switch to
+	 * init_mm instead to make sure that oldmm can be freed ASAP.
 	 */
-	if (!mm) {
+	if (!mm && !mm_dying(oldmm)) {
 		next->active_mm = oldmm;
 		mmgrab(oldmm);
 		enter_lazy_tlb(oldmm, next);
-	} else
+	} else {
+		if (!mm) {
+			mm = &init_mm;
+			next->active_mm = mm;
+			mmgrab(mm);
+		}
 		switch_mm_irqs_off(oldmm, mm, next);
+	}
 
 	if (!prev->mm) {
 		prev->active_mm = NULL;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index a787a319211e..69090a11249c 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -5,6 +5,7 @@
 #include <linux/spinlock.h>
 #include <linux/list.h>
 #include <linux/cpumask.h>
+#include <linux/sched/task.h>
 
 #include <linux/atomic.h>
 #include <linux/user_namespace.h>
@@ -36,5 +37,8 @@ struct mm_struct init_mm = {
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
 	.user_ns	= &init_user_ns,
 	.cpu_bitmap	= { [BITS_TO_LONGS(NR_CPUS)] = 0},
+#ifdef CONFIG_MEMCG
+	.owner		= &init_task,
+#endif
 	INIT_MM_CONTEXT(init_mm)
 };
-- 
2.18.1
