[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <2e91ea19fbd30fa17718cb293473ae207ee8fd0f.1355536006.git.luto@amacapital.net>
Date: Fri, 14 Dec 2012 18:17:21 -0800
From: Andy Lutomirski <luto@...capital.net>
To: linux-mm@...ck.org
Cc: linux-kernel@...r.kernel.org,
Andrew Morton <akpm@...ux-foundation.org>,
Al Viro <viro@...iv.linux.org.uk>,
Ingo Molnar <mingo@...nel.org>,
Michel Lespinasse <walken@...gle.com>,
Hugh Dickins <hughd@...gle.com>,
Jörn Engel <joern@...fs.org>,
Andy Lutomirski <luto@...capital.net>
Subject: [PATCH v2] mm: Downgrade mmap_sem before locking or populating on mmap
This is a serious cause of mmap_sem contention. MAP_POPULATE
and MCL_FUTURE, in particular, are disastrous in multithreaded programs.
Signed-off-by: Andy Lutomirski <luto@...capital.net>
---
Changes from v1:
The non-unlocking versions of do_mmap_pgoff and mmap_region are still
available for aio_setup_ring's benefit. In theory, aio_setup_ring
would do better with a lock-downgrading version, but that would be
somewhat ugly and doesn't help my workload.
arch/tile/mm/elf.c | 9 +++---
fs/aio.c | 4 +++
include/linux/mm.h | 19 ++++++++++--
ipc/shm.c | 6 ++--
mm/fremap.c | 10 ++++--
mm/mmap.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++------
mm/util.c | 3 +-
7 files changed, 117 insertions(+), 23 deletions(-)
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
index 3cfa98b..a0441f2 100644
--- a/arch/tile/mm/elf.c
+++ b/arch/tile/mm/elf.c
@@ -129,12 +129,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,
*/
if (!retval) {
unsigned long addr = MEM_USER_INTRPT;
- addr = mmap_region(NULL, addr, INTRPT_SIZE,
- MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE,
- VM_READ|VM_EXEC|
- VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0);
+ addr = mmap_region_unlock(NULL, addr, INTRPT_SIZE,
+ MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE,
+ VM_READ|VM_EXEC|
+ VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0);
if (addr > (unsigned long) -PAGE_SIZE)
retval = (int) addr;
+ return retval; /* We already unlocked mmap_sem. */
}
#endif
diff --git a/fs/aio.c b/fs/aio.c
index 71f613c..253396c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -127,6 +127,10 @@ static int aio_setup_ring(struct kioctx *ctx)
info->mmap_size = nr_pages * PAGE_SIZE;
dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
down_write(&ctx->mm->mmap_sem);
+ /*
+ * XXX: If MCL_FUTURE is set, this will hold mmap_sem for write for
+ * longer than necessary.
+ */
info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size,
PROT_READ|PROT_WRITE,
MAP_ANONYMOUS|MAP_PRIVATE, 0);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bcaab4e..139f636 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1441,14 +1441,27 @@ extern int install_special_mapping(struct mm_struct *mm,
extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+/* These must be called with mmap_sem held for write. */
extern unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, unsigned long flags,
vm_flags_t vm_flags, unsigned long pgoff);
-extern unsigned long do_mmap_pgoff(struct file *, unsigned long,
- unsigned long, unsigned long,
- unsigned long, unsigned long);
+extern unsigned long do_mmap_pgoff(struct file *, unsigned long addr,
+ unsigned long len, unsigned long prot,
+ unsigned long flags, unsigned long pgoff);
extern int do_munmap(struct mm_struct *, unsigned long, size_t);
+/*
+ * These must be called with mmap_sem held for write, and they will release
+ * mmap_sem before they return. They hold mmap_sem for a shorter time than
+ * the non-unlocking variants.
+ */
+extern unsigned long mmap_region_unlock(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long flags,
+ vm_flags_t vm_flags, unsigned long pgoff);
+extern unsigned long do_mmap_pgoff_unlock(struct file *, unsigned long addr,
+ unsigned long len, unsigned long prot,
+ unsigned long flags, unsigned long pgoff);
+
/* These take the mm semaphore themselves */
extern unsigned long vm_brk(unsigned long, unsigned long);
extern int vm_munmap(unsigned long, size_t);
diff --git a/ipc/shm.c b/ipc/shm.c
index dff40c9..d0001c8 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1068,12 +1068,14 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
addr > current->mm->start_stack - size - PAGE_SIZE * 5)
goto invalid;
}
-
- user_addr = do_mmap_pgoff(file, addr, size, prot, flags, 0);
+
+ user_addr = do_mmap_pgoff_unlock(file, addr, size, prot, flags, 0);
*raddr = user_addr;
err = 0;
if (IS_ERR_VALUE(user_addr))
err = (long)user_addr;
+ goto out_fput;
+
invalid:
up_write(&current->mm->mmap_sem);
diff --git a/mm/fremap.c b/mm/fremap.c
index a0aaf0e..7ebe0a4 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -200,8 +200,8 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
struct file *file = get_file(vma->vm_file);
flags &= MAP_NONBLOCK;
- addr = mmap_region(file, start, size,
- flags, vma->vm_flags, pgoff);
+ addr = mmap_region_unlock(file, start, size,
+ flags, vma->vm_flags, pgoff);
fput(file);
if (IS_ERR_VALUE(addr)) {
err = addr;
@@ -209,7 +209,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
BUG_ON(addr != start);
err = 0;
}
- goto out;
+ return err; /* We already unlocked. */
}
mutex_lock(&mapping->i_mmap_mutex);
flush_dcache_mmap_lock(mapping);
@@ -237,6 +237,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
/*
* might be mapping previously unmapped range of file
*/
+ if (unlikely(has_write_lock)) {
+ downgrade_write(&mm->mmap_sem);
+ has_write_lock = 0;
+ }
mlock_vma_pages_range(vma, start, start + size);
} else {
if (unlikely(has_write_lock)) {
diff --git a/mm/mmap.c b/mm/mmap.c
index 9a796c4..2dd6b2f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -995,13 +995,22 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
return hint;
}
+static unsigned long mmap_region_helper(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long flags,
+ vm_flags_t vm_flags,
+ unsigned long pgoff, int *downgraded);
+
/*
* The caller must hold down_write(&current->mm->mmap_sem).
+ *
+ * If downgraded is non-null, do_mmap_pgoff_helper may downgrade mmap_sem
+ * and write 1 to *downgraded.
*/
-
-unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
- unsigned long len, unsigned long prot,
- unsigned long flags, unsigned long pgoff)
+static unsigned long do_mmap_pgoff_helper(
+ struct file *file, unsigned long addr,
+ unsigned long len, unsigned long prot,
+ unsigned long flags, unsigned long pgoff,
+ int *downgraded)
{
struct mm_struct * mm = current->mm;
struct inode *inode;
@@ -1127,7 +1136,32 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
}
}
- return mmap_region(file, addr, len, flags, vm_flags, pgoff);
+ return mmap_region_helper(file, addr, len, flags, vm_flags, pgoff,
+ downgraded);
+}
+
+
+unsigned long do_mmap_pgoff_unlock(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long prot,
+ unsigned long flags, unsigned long pgoff)
+{
+ int downgraded = 0;
+ unsigned long ret = do_mmap_pgoff_helper(file, addr, len,
+ prot, flags, pgoff, &downgraded);
+
+ if (downgraded)
+	up_read(&current->mm->mmap_sem);
+ else
+	up_write(&current->mm->mmap_sem);
+
+ return ret;
+}
+
+unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long prot,
+ unsigned long flags, unsigned long pgoff)
+{
+ return do_mmap_pgoff_helper(file, addr, len, prot, flags, pgoff, 0);
}
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
@@ -1240,9 +1274,14 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
}
-unsigned long mmap_region(struct file *file, unsigned long addr,
- unsigned long len, unsigned long flags,
- vm_flags_t vm_flags, unsigned long pgoff)
+/*
+ * If downgraded is null, then mmap_sem won't be touched. Otherwise it
+ * may be downgraded, in which case *downgraded will be set to 1.
+ */
+static unsigned long mmap_region_helper(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long flags,
+ vm_flags_t vm_flags,
+ unsigned long pgoff, int *downgraded)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
@@ -1373,10 +1412,19 @@ out:
vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
+ if (downgraded) {
+ downgrade_write(&mm->mmap_sem);
+ *downgraded = 1;
+ }
if (!mlock_vma_pages_range(vma, addr, addr + len))
mm->locked_vm += (len >> PAGE_SHIFT);
- } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
+ } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) {
+ if (downgraded) {
+ downgrade_write(&mm->mmap_sem);
+ *downgraded = 1;
+ }
make_pages_present(addr, addr + len);
+ }
if (file)
uprobe_mmap(vma);
@@ -1400,6 +1448,29 @@ unacct_error:
return error;
}
+unsigned long mmap_region(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long flags,
+ vm_flags_t vm_flags, unsigned long pgoff)
+{
+ return mmap_region_helper(file, addr, len, flags, vm_flags, pgoff, 0);
+}
+
+unsigned long mmap_region_unlock(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long flags,
+ vm_flags_t vm_flags, unsigned long pgoff)
+{
+ int downgraded = 0;
+ unsigned long ret = mmap_region_helper(file, addr, len,
+ flags, vm_flags, pgoff, &downgraded);
+
+ if (downgraded)
+	up_read(&current->mm->mmap_sem);
+ else
+	up_write(&current->mm->mmap_sem);
+
+ return ret;
+}
+
/* Get an address range which is currently unmapped.
* For shmat() with addr=0.
*
diff --git a/mm/util.c b/mm/util.c
index dc3036c..fd54884 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -359,8 +359,7 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
ret = security_mmap_file(file, prot, flag);
if (!ret) {
down_write(&mm->mmap_sem);
- ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff);
- up_write(&mm->mmap_sem);
+ ret = do_mmap_pgoff_unlock(file, addr, len, prot, flag, pgoff);
}
return ret;
}
--
1.7.11.7
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists