Message-ID: <CACw3F52rHjxv8gWzz6_YdR038CiA1=JxUD6YuW4As=rQ2oMdag@mail.gmail.com>
Date: Thu, 8 Jan 2026 09:00:19 -0800
From: Jiaqi Yan <jiaqiyan@...gle.com>
To: ankita@...dia.com
Cc: vsethi@...dia.com, jgg@...dia.com, mochs@...dia.com, jgg@...pe.ca, 
	skolothumtho@...dia.com, alex@...zbot.org, linmiaohe@...wei.com, 
	nao.horiguchi@...il.com, cjia@...dia.com, zhiw@...dia.com, kjaju@...dia.com, 
	yishaih@...dia.com, kevin.tian@...el.com, kvm@...r.kernel.org, 
	linux-kernel@...r.kernel.org, linux-mm@...ck.org
Subject: Re: [PATCH v1 2/2] vfio/nvgrace-gpu: register device memory for
 poison handling

On Thu, Jan 8, 2026 at 7:36 AM <ankita@...dia.com> wrote:
>
> From: Ankit Agrawal <ankita@...dia.com>
>
> The nvgrace-gpu module [1] maps the device memory to the user VA (QEMU)
> without adding the memory to the kernel. The device memory pages are PFNMAP
> and not backed by struct page. The module can thus utilize the MM's PFNMAP
> memory_failure mechanism, which handles ECC/poison on regions with no
> struct pages.
>
> The kernel MM code exposes register/unregister APIs allowing modules to
> register the device memory for memory_failure handling. Make nvgrace-gpu
> register the GPU memory with the MM on open.
>
> The module registers its memory region and address_space with the
> kernel MM for ECC handling, and implements a callback to convert a PFN
> to the file page offset. The callback checks that the PFN belongs to
> the device memory region and is contained in the VMA range; an error
> is returned otherwise.
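
In other words (my paraphrase of the callback contract, based only on
the code below, so not authoritative):

	/*
	 * pfn_to_vma_pgoff(): translate a poisoned PFN into the file page
	 * offset that covers it within @vma.
	 *
	 * Returns 0 and sets *pgoff on success; -ENOENT if the VMA does
	 * not belong to this driver, -EINVAL if the region index is bad,
	 * -EFAULT if the PFN falls outside the region or the VMA range.
	 */
	int (*pfn_to_vma_pgoff)(struct vm_area_struct *vma,
				unsigned long pfn, pgoff_t *pgoff);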
>
> Link: https://lore.kernel.org/all/20240220115055.23546-1-ankita@nvidia.com/ [1]
>
> Suggested-by: Alex Williamson <alex@...zbot.org>
> Suggested-by: Jason Gunthorpe <jgg@...dia.com>
> Signed-off-by: Ankit Agrawal <ankita@...dia.com>
> ---
>  drivers/vfio/pci/nvgrace-gpu/main.c | 116 +++++++++++++++++++++++++++-
>  1 file changed, 112 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
> index b45a24d00387..d3e5fee29180 100644
> --- a/drivers/vfio/pci/nvgrace-gpu/main.c
> +++ b/drivers/vfio/pci/nvgrace-gpu/main.c
> @@ -9,6 +9,7 @@
>  #include <linux/jiffies.h>
>  #include <linux/pci-p2pdma.h>
>  #include <linux/pm_runtime.h>
> +#include <linux/memory-failure.h>
>
>  /*
>   * The device memory usable to the workloads running in the VM is cached
> @@ -49,6 +50,7 @@ struct mem_region {
>                 void *memaddr;
>                 void __iomem *ioaddr;
>         };                      /* Base virtual address of the region */
> +       struct pfn_address_space pfn_address_space;
>  };
>
>  struct nvgrace_gpu_pci_core_device {
> @@ -88,6 +90,83 @@ nvgrace_gpu_memregion(int index,
>         return NULL;
>  }
>
> +static int pfn_memregion_offset(struct nvgrace_gpu_pci_core_device *nvdev,
> +                               unsigned int index,
> +                               unsigned long pfn,
> +                               pgoff_t *pfn_offset_in_region)
> +{
> +       struct mem_region *region;
> +       unsigned long start_pfn, num_pages;
> +
> +       region = nvgrace_gpu_memregion(index, nvdev);
> +       if (!region)
> +               return -EINVAL;
> +
> +       start_pfn = PHYS_PFN(region->memphys);
> +       num_pages = region->memlength >> PAGE_SHIFT;
> +
> +       if (pfn < start_pfn || pfn >= start_pfn + num_pages)
> +               return -EFAULT;
> +
> +       *pfn_offset_in_region = pfn - start_pfn;
> +
> +       return 0;
> +}
> +
> +static inline
> +struct nvgrace_gpu_pci_core_device *vma_to_nvdev(struct vm_area_struct *vma);

Any reason not to define vma_to_nvdev() here directly, instead of
forward-declaring it and defining it later?
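
For instance (just a sketch, relying on a tentative definition of the
static const ops struct, a pattern I believe is used elsewhere in the
tree):

	static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops;

	static inline
	struct nvgrace_gpu_pci_core_device *vma_to_nvdev(struct vm_area_struct *vma)
	{
		/* Check if this VMA belongs to us */
		if (vma->vm_ops != &nvgrace_gpu_vfio_pci_mmap_ops)
			return NULL;

		return vma->vm_private_data;
	}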

> +
> +static int nvgrace_gpu_pfn_to_vma_pgoff(struct vm_area_struct *vma,
> +                                       unsigned long pfn,
> +                                       pgoff_t *pgoff)
> +{
> +       struct nvgrace_gpu_pci_core_device *nvdev;
> +       unsigned int index =
> +               vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
> +       pgoff_t vma_offset_in_region = vma->vm_pgoff &
> +               ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
> +       pgoff_t pfn_offset_in_region;
> +       int ret;
> +
> +       nvdev = vma_to_nvdev(vma);
> +       if (!nvdev)
> +               return -ENOENT;
> +
> +       ret = pfn_memregion_offset(nvdev, index, pfn, &pfn_offset_in_region);
> +       if (ret)
> +               return ret;
> +
> +       /* Ensure PFN is not before VMA's start within the region */
> +       if (pfn_offset_in_region < vma_offset_in_region)
> +               return -EFAULT;
> +
> +       /* Calculate offset from VMA start */
> +       *pgoff = vma->vm_pgoff +
> +                (pfn_offset_in_region - vma_offset_in_region);
> +
> +       return 0;
> +}
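
(For other readers: my reading of the math above, assuming
VFIO_PCI_OFFSET_SHIFT is 40 and 4K pages: the high bits of vm_pgoff
encode the region index, so index = vm_pgoff >> 28, and the low 28
bits are the page offset of the mapping's start within that region.
E.g. for a VMA that maps a region starting 0x100 pages in, a poisoned
PFN located 0x180 pages into the region gives
*pgoff = vm_pgoff + (0x180 - 0x100) = vm_pgoff + 0x80.)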
> +
> +static int
> +nvgrace_gpu_vfio_pci_register_pfn_range(struct vfio_device *core_vdev,
> +                                       struct mem_region *region)
> +{
> +       int ret;
> +       unsigned long pfn, nr_pages;
> +
> +       pfn = PHYS_PFN(region->memphys);
> +       nr_pages = region->memlength >> PAGE_SHIFT;
> +
> +       region->pfn_address_space.node.start = pfn;
> +       region->pfn_address_space.node.last = pfn + nr_pages - 1;
> +       region->pfn_address_space.mapping = core_vdev->inode->i_mapping;
> +       region->pfn_address_space.pfn_to_vma_pgoff = nvgrace_gpu_pfn_to_vma_pgoff;
> +
> +       ret = register_pfn_address_space(&region->pfn_address_space);
> +
> +       return ret;

nit: I believe the "ret" local is unnecessary here; you can return the
call directly.
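
Something like:

	return register_pfn_address_space(&region->pfn_address_space);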


> +}
> +
>  static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
>  {
>         struct vfio_pci_core_device *vdev =
> @@ -114,14 +193,28 @@ static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
>          * memory mapping.
>          */
>         ret = vfio_pci_core_setup_barmap(vdev, 0);
> -       if (ret) {
> -               vfio_pci_core_disable(vdev);
> -               return ret;
> +       if (ret)
> +               goto error_exit;
> +
> +       if (nvdev->resmem.memlength) {
> +               ret = nvgrace_gpu_vfio_pci_register_pfn_range(core_vdev, &nvdev->resmem);
> +               if (ret && ret != -EOPNOTSUPP)
> +                       goto error_exit;
>         }
>
> -       vfio_pci_core_finish_enable(vdev);
> +       ret = nvgrace_gpu_vfio_pci_register_pfn_range(core_vdev, &nvdev->usemem);
> +       if (ret && ret != -EOPNOTSUPP)
> +               goto register_mem_failed;
>
> +       vfio_pci_core_finish_enable(vdev);
>         return 0;
> +
> +register_mem_failed:
> +       if (nvdev->resmem.memlength)
> +               unregister_pfn_address_space(&nvdev->resmem.pfn_address_space);
> +error_exit:
> +       vfio_pci_core_disable(vdev);
> +       return ret;
>  }
>
>  static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
> @@ -130,6 +223,11 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
>                 container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
>                              core_device.vdev);
>
> +       if (nvdev->resmem.memlength)
> +               unregister_pfn_address_space(&nvdev->resmem.pfn_address_space);
> +
> +       unregister_pfn_address_space(&nvdev->usemem.pfn_address_space);
> +
>         /* Unmap the mapping to the device memory cached region */
>         if (nvdev->usemem.memaddr) {
>                 memunmap(nvdev->usemem.memaddr);
> @@ -247,6 +345,16 @@ static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = {
>  #endif
>  };
>
> +static inline
> +struct nvgrace_gpu_pci_core_device *vma_to_nvdev(struct vm_area_struct *vma)
> +{
> +       /* Check if this VMA belongs to us */
> +       if (vma->vm_ops != &nvgrace_gpu_vfio_pci_mmap_ops)
> +               return NULL;
> +
> +       return vma->vm_private_data;
> +}
> +
>  static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
>                             struct vm_area_struct *vma)
>  {
> --
> 2.34.1
>
>
