Message-ID: <0e1ce040-1beb-fd96-683c-1b18eb635fd6@virtuozzo.com>
Date: Thu, 24 May 2018 14:24:37 +0300
From: Pavel Emelyanov <xemul@...tuozzo.com>
To: Mike Rapoport <rppt@...ux.vnet.ibm.com>,
Andrew Morton <akpm@...ux-foundation.org>
Cc: linux-mm <linux-mm@...ck.org>, lkml <linux-kernel@...r.kernel.org>,
Andrea Arcangeli <aarcange@...hat.com>,
Mike Kravetz <mike.kravetz@...cle.com>,
Andrei Vagin <avagin@...tuozzo.com>
Subject: Re: [PATCH] userfaultfd: prevent non-cooperative events vs
mcopy_atomic races
On 05/23/2018 10:42 AM, Mike Rapoport wrote:
> If a process monitored with userfaultfd changes its memory mappings or
> fork()s at the same time as the uffd monitor fills the process memory with
> UFFDIO_COPY, the actual creation of page table entries and copying of the
> data in mcopy_atomic may happen either before or after the memory mapping
> modifications and there is no way for the uffd monitor to maintain a
> consistent view of the process memory layout.
>
> For instance, let's consider fork() running in parallel with
> userfaultfd_copy():
>
> process                   | uffd monitor
> --------------------------+------------------------------
> fork()                    | userfaultfd_copy()
> ...                       | ...
> dup_mmap()                | down_read(mmap_sem)
> down_write(mmap_sem)      | /* create PTEs, copy data */
>   dup_uffd()              | up_read(mmap_sem)
>   copy_page_range()       |
>   up_write(mmap_sem)      |
>   dup_uffd_complete()     |
>     /* notify monitor */  |
>
> If the userfaultfd_copy() takes the mmap_sem first, the new page(s) will be
> present by the time copy_page_range() is called and they will appear in the
> child's memory mappings. However, if the fork() is the first to take the
> mmap_sem, the new pages won't be mapped in the child's address space.
But in this case the child should get an entry that emits a message to uffd when stepped upon!
And uffd will just userfaultfd_copy() it again. No?
-- Pavel
> Since the userfaultfd monitor has no way to determine the order, let's
> disallow userfaultfd_copy in parallel with the non-cooperative events. In
> that case we return -EAGAIN and the uffd monitor can understand that
> userfaultfd_copy() clashed with a non-cooperative event and take
> appropriate action.
>
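For the record, a minimal monitor-side sketch of what "appropriate action"
could look like: on EAGAIN the monitor consumes the pending non-cooperative
event(s), refreshes its view of the mappings, and re-issues the copy for
whatever is left. drain_uffd_events() and the UFFD_NONBLOCK assumption below
are illustrative only, they are not part of this patch.

#include <errno.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

/*
 * Placeholder for monitor-specific logic: read the pending UFFD_EVENT_*
 * messages (fork, remap, remove, unmap) and update the tracked mappings.
 * Assumes the uffd was created with UFFD_NONBLOCK so read() returns
 * immediately once the event queue is empty.
 */
static void drain_uffd_events(int uffd)
{
	struct uffd_msg msg;

	while (read(uffd, &msg, sizeof(msg)) == sizeof(msg))
		; /* a real monitor would act on msg.event here */
}

/* Copy one range, retrying whenever a non-cooperative event races with us. */
static int copy_range_retry(int uffd, unsigned long dst,
			    unsigned long src, unsigned long len)
{
	struct uffdio_copy uc = {
		.dst  = dst,
		.src  = src,
		.len  = len,
		.mode = 0,
	};

	for (;;) {
		uc.copy = 0;
		if (ioctl(uffd, UFFDIO_COPY, &uc) == 0)
			return 0;		/* whole range filled */
		if (errno != EAGAIN)
			return -1;		/* genuine failure */

		/*
		 * A fork/mremap/etc. clashed with the copy: consume the
		 * event(s), let the monitor refresh its view, then retry
		 * whatever part of the range was not filled yet.
		 */
		drain_uffd_events(uffd);
		if (uc.copy > 0) {		/* partial progress was reported */
			uc.dst += uc.copy;
			uc.src += uc.copy;
			uc.len -= uc.copy;
		}
	}
}
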
> Signed-off-by: Mike Rapoport <rppt@...ux.vnet.ibm.com>
> Cc: Andrea Arcangeli <aarcange@...hat.com>
> Cc: Mike Kravetz <mike.kravetz@...cle.com>
> Cc: Pavel Emelyanov <xemul@...tuozzo.com>
> Cc: Andrei Vagin <avagin@...tuozzo.com>
> ---
> fs/userfaultfd.c | 22 ++++++++++++++++++++--
> include/linux/userfaultfd_k.h | 6 ++++--
> mm/userfaultfd.c | 22 +++++++++++++++++-----
> 3 files changed, 41 insertions(+), 9 deletions(-)
>
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index cec550c8468f..123bf7d516fc 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -62,6 +62,8 @@ struct userfaultfd_ctx {
> enum userfaultfd_state state;
> /* released */
> bool released;
> + /* memory mappings are changing because of non-cooperative event */
> + bool mmap_changing;
> /* mm with one ore more vmas attached to this userfaultfd_ctx */
> struct mm_struct *mm;
> };
> @@ -641,6 +643,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
> * already released.
> */
> out:
> + WRITE_ONCE(ctx->mmap_changing, false);
> userfaultfd_ctx_put(ctx);
> }
>
> @@ -686,10 +689,12 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
> ctx->state = UFFD_STATE_RUNNING;
> ctx->features = octx->features;
> ctx->released = false;
> + ctx->mmap_changing = false;
> ctx->mm = vma->vm_mm;
> mmgrab(ctx->mm);
>
> userfaultfd_ctx_get(octx);
> + WRITE_ONCE(octx->mmap_changing, true);
> fctx->orig = octx;
> fctx->new = ctx;
> list_add_tail(&fctx->list, fcs);
> @@ -732,6 +737,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
> if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) {
> vm_ctx->ctx = ctx;
> userfaultfd_ctx_get(ctx);
> + WRITE_ONCE(ctx->mmap_changing, true);
> }
> }
>
> @@ -772,6 +778,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
> return true;
>
> userfaultfd_ctx_get(ctx);
> + WRITE_ONCE(ctx->mmap_changing, true);
> up_read(&mm->mmap_sem);
>
> msg_init(&ewq.msg);
> @@ -815,6 +822,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma,
> return -ENOMEM;
>
> userfaultfd_ctx_get(ctx);
> + WRITE_ONCE(ctx->mmap_changing, true);
> unmap_ctx->ctx = ctx;
> unmap_ctx->start = start;
> unmap_ctx->end = end;
> @@ -1653,6 +1661,10 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
>
> user_uffdio_copy = (struct uffdio_copy __user *) arg;
>
> + ret = -EAGAIN;
> + if (READ_ONCE(ctx->mmap_changing))
> + goto out;
> +
> ret = -EFAULT;
> if (copy_from_user(&uffdio_copy, user_uffdio_copy,
> /* don't copy "copy" last field */
> @@ -1674,7 +1686,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
> goto out;
> if (mmget_not_zero(ctx->mm)) {
> ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
> - uffdio_copy.len);
> + uffdio_copy.len, &ctx->mmap_changing);
> mmput(ctx->mm);
> } else {
> return -ESRCH;
> @@ -1705,6 +1717,10 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
>
> user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
>
> + ret = -EAGAIN;
> + if (READ_ONCE(ctx->mmap_changing))
> + goto out;
> +
> ret = -EFAULT;
> if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
> /* don't copy "zeropage" last field */
> @@ -1721,7 +1737,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
>
> if (mmget_not_zero(ctx->mm)) {
> ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
> - uffdio_zeropage.range.len);
> + uffdio_zeropage.range.len,
> + &ctx->mmap_changing);
> mmput(ctx->mm);
> } else {
> return -ESRCH;
> @@ -1900,6 +1917,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
> ctx->features = 0;
> ctx->state = UFFD_STATE_WAIT_API;
> ctx->released = false;
> + ctx->mmap_changing = false;
> ctx->mm = current->mm;
> /* prevent the mm struct to be freed */
> mmgrab(ctx->mm);
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index f2f3b68ba910..e091f0a11b11 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -31,10 +31,12 @@
> extern int handle_userfault(struct vm_fault *vmf, unsigned long reason);
>
> extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
> - unsigned long src_start, unsigned long len);
> + unsigned long src_start, unsigned long len,
> + bool *mmap_changing);
> extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
> unsigned long dst_start,
> - unsigned long len);
> + unsigned long len,
> + bool *mmap_changing);
>
> /* mm helpers */
> static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 39791b81ede7..5029f241908f 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -404,7 +404,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
> unsigned long dst_start,
> unsigned long src_start,
> unsigned long len,
> - bool zeropage)
> + bool zeropage,
> + bool *mmap_changing)
> {
> struct vm_area_struct *dst_vma;
> ssize_t err;
> @@ -431,6 +432,15 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
> down_read(&dst_mm->mmap_sem);
>
> /*
> + * If memory mappings are changing because of non-cooperative
> + * operation (e.g. mremap) running in parallel, bail out and
> + * request the user to retry later
> + */
> + err = -EAGAIN;
> + if (mmap_changing && READ_ONCE(*mmap_changing))
> + goto out_unlock;
> +
> + /*
> * Make sure the vma is not shared, that the dst range is
> * both valid and fully within a single existing vma.
> */
> @@ -563,13 +573,15 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
> }
>
> ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
> - unsigned long src_start, unsigned long len)
> + unsigned long src_start, unsigned long len,
> + bool *mmap_changing)
> {
> - return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
> + return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
> + mmap_changing);
> }
>
> ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
> - unsigned long len)
> + unsigned long len, bool *mmap_changing)
> {
> - return __mcopy_atomic(dst_mm, start, 0, len, true);
> + return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing);
> }
>