Message-Id: <1388646744-15608-2-git-send-email-minchan@kernel.org>
Date: Thu, 2 Jan 2014 16:12:09 +0900
From: Minchan Kim <minchan@...nel.org>
To: linux-mm@...ck.org, linux-kernel@...r.kernel.org
Cc: Andrew Morton <akpm@...ux-foundation.org>,
Mel Gorman <mgorman@...e.de>, Hugh Dickins <hughd@...gle.com>,
Dave Hansen <dave.hansen@...el.com>,
Rik van Riel <riel@...hat.com>,
KOSAKI Motohiro <kosaki.motohiro@...fujitsu.com>,
Michel Lespinasse <walken@...gle.com>,
Johannes Weiner <hannes@...xchg.org>,
John Stultz <john.stultz@...aro.org>,
Dhaval Giani <dhaval.giani@...il.com>,
"H. Peter Anvin" <hpa@...or.com>,
Android Kernel Team <kernel-team@...roid.com>,
Robert Love <rlove@...gle.com>, Mel Gorman <mel@....ul.ie>,
Dmitry Adamushko <dmitry.adamushko@...il.com>,
Dave Chinner <david@...morbit.com>, Neil Brown <neilb@...e.de>,
Andrea Righi <andrea@...terlinux.com>,
Andrea Arcangeli <aarcange@...hat.com>,
"Aneesh Kumar K.V" <aneesh.kumar@...ux.vnet.ibm.com>,
Mike Hommey <mh@...ndium.org>, Taras Glek <tglek@...illa.com>,
Jan Kara <jack@...e.cz>,
KOSAKI Motohiro <kosaki.motohiro@...il.com>,
Rob Clark <robdclark@...il.com>, Jason Evans <je@...com>,
Minchan Kim <minchan@...nel.org>
Subject: [PATCH v10 01/16] vrange: Add vrange support to mm_structs

This patch adds vroot to mm_struct so a process can set volatile
ranges on its anonymous memory.
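
For illustration, a minimal user-space sketch of the intended usage
(the syscall itself is introduced later in this series; the vrange()
invocation, mode names, syscall number, and purged flag below are
assumptions modeled on that later interface, not something this patch
provides):

#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Hypothetical definitions; the real ones come from later patches. */
#define VRANGE_VOLATILE		0
#define VRANGE_NONVOLATILE	1
#define __NR_vrange		314	/* placeholder syscall number */

int main(void)
{
	size_t len = 16 * 4096;
	int purged = 0;

	/* Anonymous memory: the kind of mapping mm->vroot will track. */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	memset(buf, 0xaa, len);	/* fill a regenerable cache */

	/* Hint that these pages may be purged under memory pressure. */
	syscall(__NR_vrange, (unsigned long)buf, len,
		VRANGE_VOLATILE, &purged);

	/* ... later, before touching the data again ... */
	syscall(__NR_vrange, (unsigned long)buf, len,
		VRANGE_NONVOLATILE, &purged);
	if (purged)
		memset(buf, 0xaa, len);	/* contents were discarded */

	munmap(buf, len);
	return 0;
}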

This is somewhat wasteful, as it grows mm_struct even when a process
never uses the vrange syscall, so a later patch will provide
dynamically allocated vroots.

One thing to note in this patch is vrange_fork. Since we do
allocations while holding a lock on the vrange, it's possible they
could deadlock with direct reclaim's purging logic. For this reason,
vrange_fork uses GFP_NOIO for its allocations.

If vrange_fork fails, it isn't a critical problem: the result is that
the child process's pages won't be volatile/purgeable, which could
cause additional memory pressure but won't cause problematic
application behavior (since volatile pages are only purged at the
kernel's discretion). This is thought to be more desirable than
having fork fail.

NOTE: Additionally, as an optimization, we could drop the pages
immediately, as MADV_DONTNEED does, when we see the allocation fail.
There would be little point in establishing new volatile ranges when
memory pressure is already that tight.
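
As a rough sketch of that idea (not part of this patch; the helper
name and calling context are hypothetical, and zap_page_range() is
simply used here the way madvise(MADV_DONTNEED) uses it):

#include <linux/mm.h>

/*
 * Hypothetical fallback: if we cannot allocate a struct vrange while
 * marking a range volatile, discard the pages immediately instead,
 * MADV_DONTNEED-style.  Assumes the caller holds mmap_sem and has
 * already resolved the vma covering [start, end).
 */
static void vrange_discard_fallback(struct vm_area_struct *vma,
				    unsigned long start, unsigned long end)
{
	/* Same page-dropping path madvise(MADV_DONTNEED) takes. */
	zap_page_range(vma, start, end - start, NULL);
}
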
Cc: Mel Gorman <mel@....ul.ie>
Cc: Hugh Dickins <hughd@...gle.com>
Cc: Dave Hansen <dave.hansen@...el.com>
Cc: Rik van Riel <riel@...hat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@...il.com>
Cc: Michel Lespinasse <walken@...gle.com>
Cc: Johannes Weiner <hannes@...xchg.org>
[jstultz: Bit of refactoring. Comment cleanups]
Signed-off-by: John Stultz <john.stultz@...aro.org>
Signed-off-by: Minchan Kim <minchan@...nel.org>
---
include/linux/mm_types.h | 4 ++++
include/linux/vrange.h | 7 ++++++-
kernel/fork.c | 11 +++++++++++
mm/vrange.c | 40 ++++++++++++++++++++++++++++++++++++++++
4 files changed, 61 insertions(+), 1 deletion(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d9851eeb6e1d..a4de9cfa8ff1 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -13,6 +13,7 @@
#include <linux/page-debug-flags.h>
#include <linux/uprobes.h>
#include <linux/page-flags-layout.h>
+#include <linux/vrange_types.h>
#include <asm/page.h>
#include <asm/mmu.h>
@@ -350,6 +351,9 @@ struct mm_struct {
*/
+#ifdef CONFIG_MMU
+ struct vrange_root vroot;
+#endif
unsigned long hiwater_rss; /* High-watermark of RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */
diff --git a/include/linux/vrange.h b/include/linux/vrange.h
index 0d378a5dc8d7..2b96ee1ee75b 100644
--- a/include/linux/vrange.h
+++ b/include/linux/vrange.h
@@ -37,12 +37,17 @@ static inline int vrange_type(struct vrange *vrange)
}
extern void vrange_root_cleanup(struct vrange_root *vroot);
-
+extern int vrange_fork(struct mm_struct *new,
+ struct mm_struct *old);
#else
static inline void vrange_root_init(struct vrange_root *vroot,
int type, void *object) {};
static inline void vrange_root_cleanup(struct vrange_root *vroot) {};
+static inline int vrange_fork(struct mm_struct *new, struct mm_struct *old)
+{
+ return 0;
+}
#endif
#endif /* _LINIUX_VRANGE_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 086fe73ad6bd..36d3c4bb4c4d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -71,6 +71,7 @@
#include <linux/signalfd.h>
#include <linux/uprobes.h>
#include <linux/aio.h>
+#include <linux/vrange.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -376,6 +377,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
retval = khugepaged_fork(mm, oldmm);
if (retval)
goto out;
+ /*
+ * Note: vrange_fork can fail with ENOMEM, but this only
+ * results in the child not having any active volatile
+ * ranges. That is not harmful: the child simply will not
+ * see any of its pages purged unless it marks them as
+ * volatile again itself.
+ */
+ vrange_fork(mm, oldmm);
prev = NULL;
for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
@@ -535,6 +544,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
mm->nr_ptes = 0;
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
+ vrange_root_init(&mm->vroot, VRANGE_MM, mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
@@ -606,6 +616,7 @@ void mmput(struct mm_struct *mm)
if (atomic_dec_and_test(&mm->mm_users)) {
uprobe_clear_state(mm);
+ vrange_root_cleanup(&mm->vroot);
exit_aio(mm);
ksm_exit(mm);
khugepaged_exit(mm); /* must run before exit_mmap */
diff --git a/mm/vrange.c b/mm/vrange.c
index a5daea44e031..57dad4d72b04 100644
--- a/mm/vrange.c
+++ b/mm/vrange.c
@@ -182,3 +182,43 @@ void vrange_root_cleanup(struct vrange_root *vroot)
vrange_unlock(vroot);
}
+/*
+ * It's okay for vrange_fork to fail: the worst case is that the
+ * child process doesn't get a copy of the vrange data structures,
+ * so pages in its volatile ranges simply can't be purged. That is
+ * better than failing fork.
+ */
+int vrange_fork(struct mm_struct *new_mm, struct mm_struct *old_mm)
+{
+ struct vrange_root *new, *old;
+ struct vrange *range, *new_range;
+ struct rb_node *next;
+
+ new = &new_mm->vroot;
+ old = &old_mm->vroot;
+
+ vrange_lock(old);
+ next = rb_first(&old->v_rb);
+ while (next) {
+ range = vrange_entry(next);
+ next = rb_next(next);
+ /*
+ * We can't use GFP_KERNEL because direct reclaim's
+ * purging logic on vranges could deadlock against
+ * vrange_lock, which we hold here.
+ */
+ new_range = __vrange_alloc(GFP_NOIO);
+ if (!new_range)
+ goto fail;
+ __vrange_set(new_range, range->node.start,
+ range->node.last, range->purged);
+ __vrange_add(new_range, new);
+
+ }
+ vrange_unlock(old);
+ return 0;
+fail:
+ vrange_unlock(old);
+ vrange_root_cleanup(new);
+ return -ENOMEM;
+}
--
1.7.9.5