lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-Id: <1412307679-2458-1-git-send-email-danielmicay@gmail.com>
Date:	Thu,  2 Oct 2014 23:41:19 -0400
From:	Daniel Micay <danielmicay@...il.com>
To:	linux-mm@...ck.org
Cc:	linux-kernel@...r.kernel.org, linux-api@...r.kernel.org,
	akpm@...ux-foundation.org, jasone@...onware.com,
	luto@...capital.net, Daniel Micay <danielmicay@...il.com>
Subject: [PATCH v4] mm: add mremap flag for preserving the old mapping

This introduces the MREMAP_RETAIN flag for preserving the source mapping
when MREMAP_MAYMOVE moves the pages to a new destination. Accesses to
the source mapping will fault and map in fresh zeroed pages.

It is currently limited to writable MAP_PRIVATE|MAP_ANONYMOUS mappings
and will return EFAULT when used on anything else. This covers the
intended use case in general purpose allocators.

For consistency, the old_len >= new_len case could decommit the pages
instead of unmapping. However, userspace can accomplish the same thing
via madvise and the flag is coherent without the additional complexity.

Motivation:

TCMalloc and jemalloc avoid releasing virtual memory in order to reduce
virtual memory fragmentation. A call to munmap or mremap would leave a
hole in the address space. Instead, unused pages are lazily returned to
the operating system via MADV_DONTNEED.

Since mremap cannot be used to elide copies, TCMalloc and jemalloc end
up being significantly slower for patterns like repeated vector / hash
table reallocations. Consider the typical vector building pattern:

    #include <string.h>
    #include <stdlib.h>

    int main(void) {
        for (size_t i = 0; i < 100; i++) {
            void *ptr = NULL;
            size_t old_size = 0;
            for (size_t size = 4; size < (1 << 30); size *= 2) {
                ptr = realloc(ptr, size);
                if (!ptr) return 1;
                memset(ptr + old_size, 0xff, size - old_size);
                old_size = size;
            }
            free(ptr);
        }
    }

Transparent huge pages disabled:

glibc (baseline, uses mremap already): 15.051s
jemalloc without MREMAP_RETAIN: 38.540s
jemalloc with MREMAP_RETAIN: 15.086s

Transparent huge pages enabled:

glibc (baseline, uses mremap already): 8.464s
jemalloc without MREMAP_RETAIN: 18.230s
jemalloc with MREMAP_RETAIN: 6.696s

In practice, in-place growth never occurs for huge allocations because
the heap grows in the downwards direction for all 3 allocators. TCMalloc
and jemalloc pay for enormous copies while glibc is only spending time
writing new elements to the vector. Even if it was grown in the other
direction, real-world applications would end up blocking in-place growth
with new allocations.

The allocators could attempt to map the source location again after an
mremap call, but there is no guarantee of success in a multi-threaded
program and fragmenting memory over time is considered unacceptable.

Signed-off-by: Daniel Micay <danielmicay@...il.com>
---
 include/linux/huge_mm.h   |  2 +-
 include/linux/mm.h        |  6 ++++++
 include/uapi/linux/mman.h |  1 +
 mm/huge_memory.c          |  4 ++--
 mm/memory.c               |  2 +-
 mm/mmap.c                 | 12 +++++++++++
 mm/mremap.c               | 52 +++++++++++++++++++++++++++++++----------------
 7 files changed, 57 insertions(+), 22 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 63579cb..3c13b20 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -143,7 +143,7 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
 					 unsigned long end,
 					 long adjust_next)
 {
-	if (!vma->anon_vma || vma->vm_ops)
+	if (!vma->anon_vma || (vma->vm_ops && !vma->vm_ops->allow_huge_pages))
 		return;
 	__vma_adjust_trans_huge(vma, start, end, adjust_next);
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8981cc8..1e61036 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -273,6 +273,12 @@ struct vm_operations_struct {
 	/* called by sys_remap_file_pages() to populate non-linear mapping */
 	int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr,
 			   unsigned long size, pgoff_t pgoff);
+
+	/* Check if the mapping may be duplicated by MREMAP_RETAIN */
+	bool (*may_duplicate)(struct vm_area_struct *vma);
+
+	/* if there is no vm_ops table, this is considered true */
+	bool allow_huge_pages;
 };
 
 struct mmu_gather;
diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index ade4acd..4e9a546 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -5,6 +5,7 @@
 
 #define MREMAP_MAYMOVE	1
 #define MREMAP_FIXED	2
+#define MREMAP_RETAIN	4
 
 #define OVERCOMMIT_GUESS		0
 #define OVERCOMMIT_ALWAYS		1
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d9a21d06..350b478 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2077,7 +2077,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
 		 * page fault if needed.
 		 */
 		return 0;
-	if (vma->vm_ops)
+	if ((vma->vm_ops && !vma->vm_ops->allow_huge_pages))
 		/* khugepaged not yet working on file or special mappings */
 		return 0;
 	VM_BUG_ON(vma->vm_flags & VM_NO_THP);
@@ -2405,7 +2405,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
 	    (vma->vm_flags & VM_NOHUGEPAGE))
 		return false;
 
-	if (!vma->anon_vma || vma->vm_ops)
+	if (!vma->anon_vma || (vma->vm_ops && !vma->vm_ops->allow_huge_pages))
 		return false;
 	if (is_vma_temporary_stack(vma))
 		return false;
diff --git a/mm/memory.c b/mm/memory.c
index e229970..c181401 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3275,7 +3275,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		return VM_FAULT_OOM;
 	if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
 		int ret = VM_FAULT_FALLBACK;
-		if (!vma->vm_ops)
+		if (!vma->vm_ops || vma->vm_ops->allow_huge_pages)
 			ret = do_huge_pmd_anonymous_page(mm, vma, address,
 					pmd, flags);
 		if (!(ret & VM_FAULT_FALLBACK))
diff --git a/mm/mmap.c b/mm/mmap.c
index c0a3637..6b644fe 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1500,6 +1500,16 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
 	return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
 }
 
+static bool anon_may_duplicate(struct vm_area_struct *vma)
+{
+	return vma->vm_flags & VM_WRITE && !(vma->vm_flags & VM_SHARED);
+}
+
+static const struct vm_operations_struct anon_vmops = {
+	.may_duplicate = anon_may_duplicate,
+	.allow_huge_pages = true
+};
+
 unsigned long mmap_region(struct file *file, unsigned long addr,
 		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
 {
@@ -1569,6 +1579,8 @@ munmap_back:
 	vma->vm_flags = vm_flags;
 	vma->vm_page_prot = vm_get_page_prot(vm_flags);
 	vma->vm_pgoff = pgoff;
+	if (!file)
+		vma->vm_ops = &anon_vmops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
 
 	if (file) {
diff --git a/mm/mremap.c b/mm/mremap.c
index 05f1180..ca7a662 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -235,7 +235,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 
 static unsigned long move_vma(struct vm_area_struct *vma,
 		unsigned long old_addr, unsigned long old_len,
-		unsigned long new_len, unsigned long new_addr, bool *locked)
+		unsigned long new_len, unsigned long new_addr, bool retain,
+		bool *locked)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct vm_area_struct *new_vma;
@@ -287,15 +288,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		old_len = new_len;
 		old_addr = new_addr;
 		new_addr = -ENOMEM;
-	}
-
-	/* Conceal VM_ACCOUNT so old reservation is not undone */
-	if (vm_flags & VM_ACCOUNT) {
-		vma->vm_flags &= ~VM_ACCOUNT;
-		excess = vma->vm_end - vma->vm_start - old_len;
-		if (old_addr > vma->vm_start &&
-		    old_addr + old_len < vma->vm_end)
-			split = 1;
+		retain = false;
 	}
 
 	/*
@@ -310,6 +303,19 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	hiwater_vm = mm->hiwater_vm;
 	vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
 
+	/* Leave the old mapping in place for MREMAP_RETAIN */
+	if (retain)
+		goto out;
+
+	/* Conceal VM_ACCOUNT so old reservation is not undone */
+	if (vm_flags & VM_ACCOUNT) {
+		vma->vm_flags &= ~VM_ACCOUNT;
+		excess = vma->vm_end - vma->vm_start - old_len;
+		if (old_addr > vma->vm_start &&
+		    old_addr + old_len < vma->vm_end)
+			split = 1;
+	}
+
 	if (do_munmap(mm, old_addr, old_len) < 0) {
 		/* OOM: unable to split vma, just get accounts right */
 		vm_unacct_memory(excess >> PAGE_SHIFT);
@@ -324,6 +330,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 			vma->vm_next->vm_flags |= VM_ACCOUNT;
 	}
 
+out:
 	if (vm_flags & VM_LOCKED) {
 		mm->locked_vm += new_len >> PAGE_SHIFT;
 		*locked = true;
@@ -333,7 +340,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 }
 
 static struct vm_area_struct *vma_to_resize(unsigned long addr,
-	unsigned long old_len, unsigned long new_len, unsigned long *p)
+	unsigned long old_len, unsigned long new_len, bool retain,
+	unsigned long *p)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = find_vma(mm, addr);
@@ -348,6 +356,11 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
 	if (old_len > vma->vm_end - addr)
 		goto Efault;
 
+	/* Forbid MREMAP_RETAIN if not explicitly permitted by the mapping */
+	if (retain && !(vma->vm_ops && vma->vm_ops->may_duplicate &&
+	    vma->vm_ops->may_duplicate(vma)))
+		goto Efault;
+
 	/* Need to be careful about a growing mapping */
 	if (new_len > old_len) {
 		unsigned long pgoff;
@@ -392,7 +405,8 @@ Eagain:
 }
 
 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
-		unsigned long new_addr, unsigned long new_len, bool *locked)
+		unsigned long new_addr, unsigned long new_len, bool retain,
+		bool *locked)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
@@ -426,7 +440,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 		old_len = new_len;
 	}
 
-	vma = vma_to_resize(addr, old_len, new_len, &charged);
+	vma = vma_to_resize(addr, old_len, new_len, retain, &charged);
 	if (IS_ERR(vma)) {
 		ret = PTR_ERR(vma);
 		goto out;
@@ -442,7 +456,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 	if (ret & ~PAGE_MASK)
 		goto out1;
 
-	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
+	ret = move_vma(vma, addr, old_len, new_len, new_addr, retain, locked);
 	if (!(ret & ~PAGE_MASK))
 		goto out;
 out1:
@@ -482,7 +496,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	unsigned long charged = 0;
 	bool locked = false;
 
-	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
+	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_RETAIN))
 		return ret;
 
 	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
@@ -506,7 +520,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 
 	if (flags & MREMAP_FIXED) {
 		ret = mremap_to(addr, old_len, new_addr, new_len,
-				&locked);
+				flags & MREMAP_RETAIN, &locked);
 		goto out;
 	}
 
@@ -526,7 +540,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	/*
 	 * Ok, we need to grow..
 	 */
-	vma = vma_to_resize(addr, old_len, new_len, &charged);
+	vma = vma_to_resize(addr, old_len, new_len, flags & MREMAP_RETAIN,
+			    &charged);
 	if (IS_ERR(vma)) {
 		ret = PTR_ERR(vma);
 		goto out;
@@ -575,7 +590,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 			goto out;
 		}
 
-		ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
+		ret = move_vma(vma, addr, old_len, new_len, new_addr,
+			       flags & MREMAP_RETAIN, &locked);
 	}
 out:
 	if (ret & ~PAGE_MASK)
-- 
2.1.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ