[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <87fsgbf3gh.fsf@mpe.ellerman.id.au>
Date: Thu, 29 Sep 2022 10:07:26 +1000
From: Michael Ellerman <mpe@...erman.id.au>
To: Alistair Popple <apopple@...dia.com>, linux-mm@...ck.org,
Andrew Morton <akpm@...ux-foundation.org>
Cc: Nicholas Piggin <npiggin@...il.com>,
Felix Kuehling <Felix.Kuehling@....com>,
Alex Deucher <alexander.deucher@....com>,
Christian König <christian.koenig@....com>,
"Pan, Xinhui" <Xinhui.Pan@....com>,
David Airlie <airlied@...ux.ie>,
Daniel Vetter <daniel@...ll.ch>,
Ben Skeggs <bskeggs@...hat.com>,
Karol Herbst <kherbst@...hat.com>,
Lyude Paul <lyude@...hat.com>,
Ralph Campbell <rcampbell@...dia.com>,
"Matthew Wilcox (Oracle)" <willy@...radead.org>,
Alex Sierra <alex.sierra@....com>,
John Hubbard <jhubbard@...dia.com>,
linuxppc-dev@...ts.ozlabs.org, linux-kernel@...r.kernel.org,
amd-gfx@...ts.freedesktop.org, nouveau@...ts.freedesktop.org,
dri-devel@...ts.freedesktop.org, Jason Gunthorpe <jgg@...dia.com>,
Dan Williams <dan.j.williams@...el.com>,
Alistair Popple <apopple@...dia.com>
Subject: Re: [PATCH 1/7] mm/memory.c: Fix race when faulting a device
private page
Alistair Popple <apopple@...dia.com> writes:
> When the CPU tries to access a device private page the migrate_to_ram()
> callback associated with the pgmap for the page is called. However no
> reference is taken on the faulting page. Therefore a concurrent
> migration of the device private page can free the page and possibly the
> underlying pgmap. This results in a race which can crash the kernel due
> to the migrate_to_ram() function pointer becoming invalid. It also means
> drivers can't reliably read the zone_device_data field because the page
> may have been freed with memunmap_pages().
>
> Close the race by getting a reference on the page while holding the ptl
> to ensure it has not been freed. Unfortunately the elevated reference
> count will cause the migration required to handle the fault to fail. To
> avoid this failure pass the faulting page into the migrate_vma functions
> so that if an elevated reference count is found it can be checked to see
> if it's expected or not.
>
> Signed-off-by: Alistair Popple <apopple@...dia.com>
> ---
> arch/powerpc/kvm/book3s_hv_uvmem.c | 15 ++++++-----
> drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 17 +++++++------
> drivers/gpu/drm/amd/amdkfd/kfd_migrate.h | 2 +-
> drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 11 +++++---
> include/linux/migrate.h | 8 ++++++-
> lib/test_hmm.c | 7 ++---
> mm/memory.c | 16 +++++++++++-
> mm/migrate.c | 34 ++++++++++++++-----------
> mm/migrate_device.c | 18 +++++++++----
> 9 files changed, 87 insertions(+), 41 deletions(-)
>
> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
> index 5980063..d4eacf4 100644
> --- a/arch/powerpc/kvm/book3s_hv_uvmem.c
> +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
> @@ -508,10 +508,10 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
> static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
> unsigned long start,
> unsigned long end, unsigned long page_shift,
> - struct kvm *kvm, unsigned long gpa)
> + struct kvm *kvm, unsigned long gpa, struct page *fault_page)
> {
> unsigned long src_pfn, dst_pfn = 0;
> - struct migrate_vma mig;
> + struct migrate_vma mig = { 0 };
> struct page *dpage, *spage;
> struct kvmppc_uvmem_page_pvt *pvt;
> unsigned long pfn;
> @@ -525,6 +525,7 @@ static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
> mig.dst = &dst_pfn;
> mig.pgmap_owner = &kvmppc_uvmem_pgmap;
> mig.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
> + mig.fault_page = fault_page;
>
> /* The requested page is already paged-out, nothing to do */
> if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL))
> @@ -580,12 +581,14 @@ static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
> static inline int kvmppc_svm_page_out(struct vm_area_struct *vma,
> unsigned long start, unsigned long end,
> unsigned long page_shift,
> - struct kvm *kvm, unsigned long gpa)
> + struct kvm *kvm, unsigned long gpa,
> + struct page *fault_page)
> {
> int ret;
>
> mutex_lock(&kvm->arch.uvmem_lock);
> - ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa);
> + ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa,
> + fault_page);
> mutex_unlock(&kvm->arch.uvmem_lock);
>
> return ret;
> @@ -736,7 +739,7 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma,
> bool pagein)
> {
> unsigned long src_pfn, dst_pfn = 0;
> - struct migrate_vma mig;
> + struct migrate_vma mig = { 0 };
> struct page *spage;
> unsigned long pfn;
> struct page *dpage;
> @@ -994,7 +997,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf)
>
> if (kvmppc_svm_page_out(vmf->vma, vmf->address,
> vmf->address + PAGE_SIZE, PAGE_SHIFT,
> - pvt->kvm, pvt->gpa))
> + pvt->kvm, pvt->gpa, vmf->page))
> return VM_FAULT_SIGBUS;
> else
> return 0;
I don't have a UV test system, but as-is it doesn't even compile :)
kvmppc_svm_page_out() is called via some paths other than the
migrate_to_ram callback.
I think it's correct to just pass fault_page = NULL when it's not called
from the migrate_to_ram callback?
Incremental diff below.
cheers
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index d4eacf410956..965c9e9e500b 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -637,7 +637,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot,
pvt->remove_gfn = true;
if (__kvmppc_svm_page_out(vma, addr, addr + PAGE_SIZE,
- PAGE_SHIFT, kvm, pvt->gpa))
+ PAGE_SHIFT, kvm, pvt->gpa, NULL))
pr_err("Can't page out gpa:0x%lx addr:0x%lx\n",
pvt->gpa, addr);
} else {
@@ -1068,7 +1068,7 @@ kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long gpa,
if (!vma || vma->vm_start > start || vma->vm_end < end)
goto out;
- if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa))
+ if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa, NULL))
ret = H_SUCCESS;
out:
mmap_read_unlock(kvm->mm);
Powered by blists - more mailing lists