[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251213170002.5babbf70.alex@shazbot.org>
Date: Sat, 13 Dec 2025 17:00:02 +0900
From: Alex Williamson <alex@...zbot.org>
To: <ankita@...dia.com>
Cc: <vsethi@...dia.com>, <jgg@...dia.com>, <mochs@...dia.com>,
<jgg@...pe.ca>, <skolothumtho@...dia.com>, <akpm@...ux-foundation.org>,
<linmiaohe@...wei.com>, <nao.horiguchi@...il.com>, <cjia@...dia.com>,
<zhiw@...dia.com>, <kjaju@...dia.com>, <yishaih@...dia.com>,
<kevin.tian@...el.com>, <kvm@...r.kernel.org>,
<linux-kernel@...r.kernel.org>, <linux-mm@...ck.org>
Subject: Re: [PATCH v2 3/3] vfio/nvgrace-gpu: register device memory for
poison handling
On Sat, 13 Dec 2025 04:47:08 +0000
<ankita@...dia.com> wrote:
> From: Ankit Agrawal <ankita@...dia.com>
>
> The nvgrace-gpu module [1] maps the device memory to the user VA (Qemu)
> without adding the memory to the kernel. The device memory pages are PFNMAP
> and not backed by struct page. The module can thus utilize the MM's PFNMAP
> memory_failure mechanism that handles ECC/poison on regions with no struct
> pages.
>
> The kernel MM code exposes register/unregister APIs allowing modules to
> register the device memory for memory_failure handling. Make nvgrace-gpu
> register the GPU memory with the MM on open.
>
> The module registers its memory region and the address_space with the
> kernel MM for ECC handling and implements a callback function to convert
> the PFN to the file page offset. The callback function checks if the
> PFN belongs to the device memory region and is also contained in the
> VMA range; an error is returned otherwise.
>
> Link: https://lore.kernel.org/all/20240220115055.23546-1-ankita@nvidia.com/ [1]
>
> Suggested-by: Alex Williamson <alex@...zbot.org>
> Suggested-by: Jason Gunthorpe <jgg@...dia.com>
> Signed-off-by: Ankit Agrawal <ankita@...dia.com>
> ---
> drivers/vfio/pci/nvgrace-gpu/main.c | 116 +++++++++++++++++++++++++++-
> 1 file changed, 112 insertions(+), 4 deletions(-)
I'm not sure where Andrew stands with this series going into v6.19-rc
via mm as an alternate fix to Linus' revert, but in case it's on the
table for that to happen:
Reviewed-by: Alex Williamson <alex@...zbot.org>
Otherwise let's get some mm buy-in for the front of the series and
maybe it should go in through vfio since nvgrace is the only user of
these interfaces currently. Thanks,
Alex
>
> diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
> index 84d142a47ec6..91b4a3a135cf 100644
> --- a/drivers/vfio/pci/nvgrace-gpu/main.c
> +++ b/drivers/vfio/pci/nvgrace-gpu/main.c
> @@ -9,6 +9,7 @@
> #include <linux/jiffies.h>
> #include <linux/pci-p2pdma.h>
> #include <linux/pm_runtime.h>
> +#include <linux/memory-failure.h>
>
> /*
> * The device memory usable to the workloads running in the VM is cached
> @@ -49,6 +50,7 @@ struct mem_region {
> void *memaddr;
> void __iomem *ioaddr;
> }; /* Base virtual address of the region */
> + struct pfn_address_space pfn_address_space;
> };
>
> struct nvgrace_gpu_pci_core_device {
> @@ -88,6 +90,83 @@ nvgrace_gpu_memregion(int index,
> return NULL;
> }
>
> +static int pfn_memregion_offset(struct nvgrace_gpu_pci_core_device *nvdev,
> + unsigned int index,
> + unsigned long pfn,
> + pgoff_t *pfn_offset_in_region)
> +{
> + struct mem_region *region;
> + unsigned long start_pfn, num_pages;
> +
> + region = nvgrace_gpu_memregion(index, nvdev);
> + if (!region)
> + return -EINVAL;
> +
> + start_pfn = PHYS_PFN(region->memphys);
> + num_pages = region->memlength >> PAGE_SHIFT;
> +
> + if (pfn < start_pfn || pfn >= start_pfn + num_pages)
> + return -EFAULT;
> +
> + *pfn_offset_in_region = pfn - start_pfn;
> +
> + return 0;
> +}
> +
> +static inline
> +struct nvgrace_gpu_pci_core_device *vma_to_nvdev(struct vm_area_struct *vma);
> +
> +static int nvgrace_gpu_pfn_to_vma_pgoff(struct vm_area_struct *vma,
> + unsigned long pfn,
> + pgoff_t *pgoff)
> +{
> + struct nvgrace_gpu_pci_core_device *nvdev;
> + unsigned int index =
> + vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
> + pgoff_t vma_offset_in_region = vma->vm_pgoff &
> + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
> + pgoff_t pfn_offset_in_region;
> + int ret;
> +
> + nvdev = vma_to_nvdev(vma);
> + if (!nvdev)
> + return -ENOENT;
> +
> + ret = pfn_memregion_offset(nvdev, index, pfn, &pfn_offset_in_region);
> + if (ret)
> + return ret;
> +
> + /* Ensure PFN is not before VMA's start within the region */
> + if (pfn_offset_in_region < vma_offset_in_region)
> + return -EFAULT;
> +
> + /* Calculate offset from VMA start */
> + *pgoff = vma->vm_pgoff +
> + (pfn_offset_in_region - vma_offset_in_region);
> +
> + return 0;
> +}
> +
> +static int
> +nvgrace_gpu_vfio_pci_register_pfn_range(struct vfio_device *core_vdev,
> + struct mem_region *region)
> +{
> + int ret;
> + unsigned long pfn, nr_pages;
> +
> + pfn = PHYS_PFN(region->memphys);
> + nr_pages = region->memlength >> PAGE_SHIFT;
> +
> + region->pfn_address_space.node.start = pfn;
> + region->pfn_address_space.node.last = pfn + nr_pages - 1;
> + region->pfn_address_space.mapping = core_vdev->inode->i_mapping;
> + region->pfn_address_space.pfn_to_vma_pgoff = nvgrace_gpu_pfn_to_vma_pgoff;
> +
> + ret = register_pfn_address_space(&region->pfn_address_space);
> +
> + return ret;
> +}
> +
> static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
> {
> struct vfio_pci_core_device *vdev =
> @@ -114,14 +193,28 @@ static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
> * memory mapping.
> */
> ret = vfio_pci_core_setup_barmap(vdev, 0);
> - if (ret) {
> - vfio_pci_core_disable(vdev);
> - return ret;
> + if (ret)
> + goto error_exit;
> +
> + if (nvdev->resmem.memlength) {
> + ret = nvgrace_gpu_vfio_pci_register_pfn_range(core_vdev, &nvdev->resmem);
> + if (ret && ret != -EOPNOTSUPP)
> + goto error_exit;
> }
>
> - vfio_pci_core_finish_enable(vdev);
> + ret = nvgrace_gpu_vfio_pci_register_pfn_range(core_vdev, &nvdev->usemem);
> + if (ret && ret != -EOPNOTSUPP)
> + goto register_mem_failed;
>
> + vfio_pci_core_finish_enable(vdev);
> return 0;
> +
> +register_mem_failed:
> + if (nvdev->resmem.memlength)
> + unregister_pfn_address_space(&nvdev->resmem.pfn_address_space);
> +error_exit:
> + vfio_pci_core_disable(vdev);
> + return ret;
> }
>
> static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
> @@ -130,6 +223,11 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
> container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
> core_device.vdev);
>
> + if (nvdev->resmem.memlength)
> + unregister_pfn_address_space(&nvdev->resmem.pfn_address_space);
> +
> + unregister_pfn_address_space(&nvdev->usemem.pfn_address_space);
> +
> /* Unmap the mapping to the device memory cached region */
> if (nvdev->usemem.memaddr) {
> memunmap(nvdev->usemem.memaddr);
> @@ -247,6 +345,16 @@ static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = {
> #endif
> };
>
> +static inline
> +struct nvgrace_gpu_pci_core_device *vma_to_nvdev(struct vm_area_struct *vma)
> +{
> + /* Check if this VMA belongs to us */
> + if (vma->vm_ops != &nvgrace_gpu_vfio_pci_mmap_ops)
> + return NULL;
> +
> + return vma->vm_private_data;
> +}
> +
> static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
> struct vm_area_struct *vma)
> {
Powered by blists - more mailing lists