Introduce VM_PINNED and related machinery to solve a number of issues.

Firstly, various subsystems (perf and IB among others) 'pin' significant
chunks of memory (by holding page refs or through custom maps). Because this
memory is unevictable, it must be tested against RLIMIT_MEMLOCK. However, the
same ranges can also be mlock()ed, resulting in double accounting. Commit
bc3e53f682 ("mm: distinguish between mlocked and pinned pages") split the
counter into mm_struct::locked_vm and mm_struct::pinned_vm, but did not
include pinned_vm in the RLIMIT_MEMLOCK test. As a result RLIMIT_MEMLOCK
under-accounts, effectively allowing double the intended amount of memory to
become unevictable. By introducing VM_PINNED and tracking these ranges as
VMAs we have sufficient information to account all pages exactly once,
neither over- nor under-accounting any of them.

Secondly, the long-term pinning of pages gets CMA and compaction into
trouble, because these pages (especially for IB) start life as normal
movable pages but are no longer movable once 'pinned'. This results in CMA
and compaction failures. By having a single common function, mm_mpin(),
called before get_user_pages(), we can rectify this by first migrating the
pages to a more suitable location -- this patch does not do that, but it
provides the infrastructure to do so.

Thirdly, because VM_LOCKED still allows unmapping (and therefore page
migration), the -rt people are not pleased and would very much like
something stronger. This patch provides the required infrastructure (but
not the user interfaces).

Cc: Christoph Lameter
Cc: Thomas Gleixner
Cc: Andrew Morton
Cc: Hugh Dickins
Cc: Mel Gorman
Cc: Roland Dreier
Cc: Sean Hefty
Cc: Hal Rosenstock
Signed-off-by: Peter Zijlstra
---
 include/linux/mm.h       |    3 +
 include/linux/mm_types.h |    5 +
 kernel/fork.c            |    2 
 mm/mlock.c               |  133 ++++++++++++++++++++++++++++++++++++++++++-----
 mm/mmap.c                |   18 ++++--
 5 files changed, 141 insertions(+), 20 deletions(-)

--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -113,6 +113,7 @@ extern unsigned int kobjsize(const void
 #define VM_PFNMAP	0x00000400	/* Page-ranges managed without "struct page", just pure PFN */
 #define VM_DENYWRITE	0x00000800	/* ETXTBSY on write attempts.. */
+#define VM_PINNED	0x00001000
 #define VM_LOCKED	0x00002000
 #define VM_IO		0x00004000	/* Memory mapped I/O or similar */
@@ -1808,6 +1809,8 @@ static inline void mm_populate(unsigned
 	/* Ignore errors */
 	(void) __mm_populate(addr, len, 1);
 }
+extern int mm_mpin(unsigned long start, size_t len);
+extern int mm_munpin(unsigned long start, size_t len);
 #else
 static inline void mm_populate(unsigned long addr, unsigned long len) {}
 #endif
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -469,6 +469,11 @@ static inline cpumask_t *mm_cpumask(stru
 	return mm->cpu_vm_mask_var;
 }
 
+static inline unsigned long mm_locked_pages(struct mm_struct *mm)
+{
+	return mm->pinned_vm + mm->locked_vm;
+}
+
 #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
 /*
  * Memory barriers to keep this state in sync are graciously provided by
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -410,6 +410,8 @@ static int dup_mmap(struct mm_struct *mm
 		if (anon_vma_fork(tmp, mpnt))
 			goto fail_nomem_anon_vma_fork;
 		tmp->vm_flags &= ~VM_LOCKED;
+		if (tmp->vm_flags & VM_PINNED)
+			mm->pinned_vm += vma_pages(tmp);
 		tmp->vm_next = tmp->vm_prev = NULL;
 		file = tmp->vm_file;
 		if (file) {
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -549,9 +549,8 @@ static int mlock_fixup(struct vm_area_st
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pgoff_t pgoff;
-	int nr_pages;
+	int nr_pages, nr_locked, nr_pinned;
 	int ret = 0;
-	int lock = !!(newflags & VM_LOCKED);
 
 	if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
 	    is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
@@ -582,9 +581,49 @@ static int mlock_fixup(struct vm_area_st
 	 * Keep track of amount of locked VM.
 	 */
 	nr_pages = (end - start) >> PAGE_SHIFT;
-	if (!lock)
-		nr_pages = -nr_pages;
-	mm->locked_vm += nr_pages;
+
+	/*
+	 * We should only account pages once; if VM_PINNED is set, pages are
+	 * accounted in mm_struct::pinned_vm, otherwise if VM_LOCKED is set,
+	 * we account them in mm_struct::locked_vm.
+	 *
+	 * PL  := vma->vm_flags
+	 * PL' := newflags
+	 * PLd := {pinned,locked}_vm delta
+	 *
+	 * PL->PL'	PLd
+	 * -----------------
+	 * 00  01	0+
+	 * 00  10	+0
+	 * 01  11	+-
+	 * 01  00	0-
+	 * 10  00	-0
+	 * 10  11	00
+	 * 11  01	-+
+	 * 11  10	00
+	 */
+
+	nr_pinned = nr_locked = 0;
+
+	if ((vma->vm_flags ^ newflags) & VM_PINNED) {
+		if (vma->vm_flags & VM_PINNED)
+			nr_pinned = -nr_pages;
+		else
+			nr_pinned = nr_pages;
+	}
+
+	if (vma->vm_flags & VM_PINNED) {
+		if ((newflags & (VM_PINNED|VM_LOCKED)) == VM_LOCKED)
+			nr_locked = nr_pages;
+	} else {
+		if (vma->vm_flags & VM_LOCKED)
+			nr_locked = -nr_pages;
+		else if (newflags & VM_LOCKED)
+			nr_locked = nr_pages;
+	}
+
+	mm->pinned_vm += nr_pinned;
+	mm->locked_vm += nr_locked;
 
 	/*
 	 * vm_flags is protected by the mmap_sem held in write mode.
@@ -592,7 +631,7 @@ static int mlock_fixup(struct vm_area_st
 	 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
 	 */
-	if (lock)
+	if (((vma->vm_flags ^ newflags) & VM_PINNED) || (newflags & VM_LOCKED))
 		vma->vm_flags = newflags;
 	else
 		munlock_vma_pages_range(vma, start, end);
@@ -602,12 +641,17 @@ static int mlock_fixup(struct vm_area_st
 	return ret;
 }
 
-static int do_mlock(unsigned long start, size_t len, int on)
+#define MLOCK_F_ON	0x01
+#define MLOCK_F_PIN	0x02
+
+static int do_mlock(unsigned long start, size_t len, unsigned int flags)
 {
 	unsigned long nstart, end, tmp;
 	struct vm_area_struct * vma, * prev;
 	int error;
 
+	lockdep_assert_held(&current->mm->mmap_sem);
+
 	VM_BUG_ON(start & ~PAGE_MASK);
 	VM_BUG_ON(len != PAGE_ALIGN(len));
 	end = start + len;
@@ -624,13 +668,18 @@ static int do_mlock(unsigned long start,
 	prev = vma;
 
 	for (nstart = start ; ; ) {
-		vm_flags_t newflags;
+		vm_flags_t newflags = vma->vm_flags;
+		vm_flags_t flag = VM_LOCKED;
 
-		/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
+		if (flags & MLOCK_F_PIN)
+			flag = VM_PINNED;
 
-		newflags = vma->vm_flags & ~VM_LOCKED;
-		if (on)
-			newflags |= VM_LOCKED;
+		if (flags & MLOCK_F_ON)
+			newflags |= flag;
+		else
+			newflags &= ~flag;
+
+		/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
 
 		tmp = vma->vm_end;
 		if (tmp > end)
@@ -653,6 +702,62 @@ static int do_mlock(unsigned long start,
 	return error;
 }
 
+/**
+ * mm_mpin - create a pinned vma
+ * @start: vaddr to start the vma
+ * @len: size of the vma
+ *
+ * Creates a pinned vma; pinning is similar to mlock() in that the pages
+ * will be unevictable, but stronger in that the pages will be unmappable
+ * as well. Typically this is called before a driver does get_user_pages()
+ * on a chunk of memory on behalf of a user.
+ *
+ * Must be called with current->mm->mmap_sem held for writing.
+ *
+ * Returns 0 for success, otherwise:
+ *  -EPERM  - the caller is not privileged
+ *  -ENOMEM - the caller exceeded RLIMIT_MEMLOCK
+ *  -ENOMEM - failed to allocate sufficient memory
+ */
+int mm_mpin(unsigned long start, size_t len)
+{
+	unsigned long locked, lock_limit;
+
+	if (!can_do_mlock())
+		return -EPERM;
+
+	lock_limit = rlimit(RLIMIT_MEMLOCK);
+	lock_limit >>= PAGE_SHIFT;
+	locked = len >> PAGE_SHIFT;
+	locked += mm_locked_pages(current->mm);
+
+	if (!((locked <= lock_limit) || capable(CAP_IPC_LOCK)))
+		return -ENOMEM;
+
+	/*
+	 * Because we're typically called before a long-term get_user_pages()
+	 * call, this is a good spot to avoid eviction-related problems:
+	 *
+	 * TODO: migrate all these pages out of CMA regions;
+	 * TODO: migrate the pages to UNMOVABLE page blocks;
+	 * TODO: linearize these pages to avoid compaction issues.
+	 */
+	return do_mlock(start, len, MLOCK_F_ON | MLOCK_F_PIN);
+}
+EXPORT_SYMBOL_GPL(mm_mpin);
+
+/**
+ * mm_munpin - destroy a pinned vma
+ * @start: vaddr of the vma start
+ * @len: size of the vma
+ *
+ * Undoes mm_mpin(). Must be called with current->mm->mmap_sem held for
+ * writing.
+ */
+int mm_munpin(unsigned long start, size_t len)
+{
+	return do_mlock(start, len, MLOCK_F_PIN);
+}
+EXPORT_SYMBOL_GPL(mm_munpin);
+
 /*
  * __mm_populate - populate and/or mlock pages within a range of address space.
  *
@@ -736,11 +841,11 @@ SYSCALL_DEFINE2(mlock, unsigned long, st
 	down_write(&current->mm->mmap_sem);
 
-	locked += current->mm->locked_vm;
+	locked += mm_locked_pages(current->mm);
 
 	/* check against resource limits */
 	if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
-		error = do_mlock(start, len, 1);
+		error = do_mlock(start, len, MLOCK_F_ON);
 
 	up_write(&current->mm->mmap_sem);
 	if (!error)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1210,7 +1210,7 @@ static inline int mlock_future_check(str
 	/*  mlock MCL_FUTURE? */
 	if (flags & VM_LOCKED) {
 		locked = len >> PAGE_SHIFT;
-		locked += mm->locked_vm;
+		locked += mm_locked_pages(mm);
 		lock_limit = rlimit(RLIMIT_MEMLOCK);
 		lock_limit >>= PAGE_SHIFT;
 		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
@@ -1616,7 +1616,9 @@ unsigned long mmap_region(struct file *f
 	perf_event_mmap(vma);
 
 	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
-	if (vm_flags & VM_LOCKED) {
+	if (vm_flags & VM_PINNED) {
+		mm->pinned_vm += (len >> PAGE_SHIFT);
+	} else if (vm_flags & VM_LOCKED) {
 		if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
 					vma == get_gate_vma(current->mm)))
 			mm->locked_vm += (len >> PAGE_SHIFT);
@@ -2069,7 +2071,7 @@ static int acct_stack_growth(struct vm_a
 	if (vma->vm_flags & VM_LOCKED) {
 		unsigned long locked;
 		unsigned long limit;
-		locked = mm->locked_vm + grow;
+		locked = mm_locked_pages(mm) + grow;
 		limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
 		limit >>= PAGE_SHIFT;
 		if (locked > limit && !capable(CAP_IPC_LOCK))
@@ -2538,13 +2540,17 @@ int do_munmap(struct mm_struct *mm, unsi
 	/*
 	 * unlock any mlock()ed ranges before detaching vmas
 	 */
-	if (mm->locked_vm) {
+	if (mm->locked_vm || mm->pinned_vm) {
 		struct vm_area_struct *tmp = vma;
 		while (tmp && tmp->vm_start < end) {
-			if (tmp->vm_flags & VM_LOCKED) {
+			if (tmp->vm_flags & VM_PINNED)
+				mm->pinned_vm -= vma_pages(tmp);
+			else if (tmp->vm_flags & VM_LOCKED)
 				mm->locked_vm -= vma_pages(tmp);
+
+			if (tmp->vm_flags & VM_LOCKED)
 				munlock_vma_pages_all(tmp);
-			}
+
 			tmp = tmp->vm_next;
 		}
 	}
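
---

A usage note for reviewers, with a hypothetical sketch (not part of this
patch) of how a driver such as IB might use the new interface around a
long-term get_user_pages(). The function name pin_user_buffer() and its
error handling are made up for illustration; it assumes the current
eight-argument get_user_pages() signature, and that start/len are
page-aligned as do_mlock() asserts:

#include <linux/mm.h>
#include <linux/sched.h>

static long pin_user_buffer(unsigned long start, size_t len,
			    struct page **pages)
{
	long npages = len >> PAGE_SHIFT;
	long got;
	int error;

	/* mm_mpin() requires mmap_sem; take it for writing */
	down_write(&current->mm->mmap_sem);

	/* account the range in pinned_vm and mark the vma(s) VM_PINNED */
	error = mm_mpin(start, len);
	if (error)
		goto unlock;

	/* take the long-term page references the device will use */
	got = get_user_pages(current, current->mm, start, npages,
			     1 /* write */, 0 /* force */, pages, NULL);
	if (got == npages) {
		up_write(&current->mm->mmap_sem);
		return npages;
	}

	/* partial failure: drop the refs we did get, undo the accounting */
	error = got < 0 ? got : -EFAULT;
	while (got > 0)
		put_page(pages[--got]);
	mm_munpin(start, len);
unlock:
	up_write(&current->mm->mmap_sem);
	return error;
}

Teardown is the mirror image: put_page() each page, then mm_munpin(start,
len) under mmap_sem to drop the VM_PINNED accounting again.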
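
Also for illustration: the PL->PL' table in mlock_fixup() can be checked
mechanically. The stand-alone userspace program below (again hypothetical,
not part of the patch) re-implements the two delta computations and asserts
that, for each of the eight transitions do_mlock() can generate, every page
ends up accounted exactly once: in pinned_vm when VM_PINNED is set,
otherwise in locked_vm when VM_LOCKED is set.

#include <stdio.h>
#include <assert.h>

#define VM_PINNED	0x1
#define VM_LOCKED	0x2

/* pages in state 's' belong in pinned_vm iff VM_PINNED is set */
static int in_pinned(int s)
{
	return (s & VM_PINNED) ? 1 : 0;
}

/* ... and in locked_vm iff VM_LOCKED is set without VM_PINNED */
static int in_locked(int s)
{
	return ((s & (VM_PINNED|VM_LOCKED)) == VM_LOCKED) ? 1 : 0;
}

int main(void)
{
	int old, flag, on;

	for (old = 0; old < 4; old++) {
		for (flag = VM_PINNED; flag <= VM_LOCKED; flag <<= 1) {
			for (on = 0; on <= 1; on++) {
				/* do_mlock() only ever flips one flag */
				int new = on ? (old | flag) : (old & ~flag);
				int nr_pinned = 0, nr_locked = 0;

				if (new == old)	/* mlock_fixup() bails early */
					continue;

				/* the deltas from mlock_fixup(), nr_pages == 1 */
				if ((old ^ new) & VM_PINNED)
					nr_pinned = (old & VM_PINNED) ? -1 : 1;

				if (old & VM_PINNED) {
					if ((new & (VM_PINNED|VM_LOCKED)) == VM_LOCKED)
						nr_locked = 1;
				} else {
					if (old & VM_LOCKED)
						nr_locked = -1;
					else if (new & VM_LOCKED)
						nr_locked = 1;
				}

				/* each page accounted once, in the right counter */
				assert(nr_pinned == in_pinned(new) - in_pinned(old));
				assert(nr_locked == in_locked(new) - in_locked(old));

				printf("%d%d -> %d%d : pinned %+d locked %+d\n",
				       !!(old & VM_PINNED), !!(old & VM_LOCKED),
				       !!(new & VM_PINNED), !!(new & VM_LOCKED),
				       nr_pinned, nr_locked);
			}
		}
	}
	return 0;
}

This reproduces the table from the changelog comment and would trip an
assertion if any transition double- or under-accounted.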