linux-kernel - Re: [PATCH v2 3/3] vfio/nvgrace-gpu: register device memory for poison handling

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251213170002.5babbf70.alex@shazbot.org>
Date: Sat, 13 Dec 2025 17:00:02 +0900
From: Alex Williamson <alex@...zbot.org>
To: <ankita@...dia.com>
Cc: <vsethi@...dia.com>, <jgg@...dia.com>, <mochs@...dia.com>,
 <jgg@...pe.ca>, <skolothumtho@...dia.com>, <akpm@...ux-foundation.org>,
 <linmiaohe@...wei.com>, <nao.horiguchi@...il.com>, <cjia@...dia.com>,
 <zhiw@...dia.com>, <kjaju@...dia.com>, <yishaih@...dia.com>,
 <kevin.tian@...el.com>, <kvm@...r.kernel.org>,
 <linux-kernel@...r.kernel.org>, <linux-mm@...ck.org>
Subject: Re: [PATCH v2 3/3] vfio/nvgrace-gpu: register device memory for
 poison handling

On Sat, 13 Dec 2025 04:47:08 +0000
<ankita@...dia.com> wrote:

> From: Ankit Agrawal <ankita@...dia.com>
> 
> The nvgrace-gpu module [1] maps the device memory to the user VA (Qemu)
> without adding the memory to the kernel. The device memory pages are PFNMAP
> and not backed by struct page. The module can thus utilize the MM's PFNMAP
> memory_failure mechanism that handles ECC/poison on regions with no struct
> pages.
> 
> The kernel MM code exposes register/unregister APIs allowing modules to
> register the device memory for memory_failure handling. Make nvgrace-gpu
> register the GPU memory with the MM on open.
> 
> The module registers its memory region and the address_space with the
> kernel MM for ECC handling and implements a callback function to convert
> the PFN to the file page offset. The callback function checks whether the
> PFN belongs to the device memory region and is also contained in the
> VMA range; an error is returned otherwise.
> 
> Link: https://lore.kernel.org/all/20240220115055.23546-1-ankita@nvidia.com/ [1]
> 
> Suggested-by: Alex Williamson <alex@...zbot.org>
> Suggested-by: Jason Gunthorpe <jgg@...dia.com>
> Signed-off-by: Ankit Agrawal <ankita@...dia.com>
> ---
>  drivers/vfio/pci/nvgrace-gpu/main.c | 116 +++++++++++++++++++++++++++-
>  1 file changed, 112 insertions(+), 4 deletions(-)

I'm not sure where Andrew stands with this series going into v6.19-rc
via mm as an alternate fix to Linus' revert, but in case it's on the
table for that to happen:

Reviewed-by: Alex Williamson <alex@...zbot.org>

Otherwise let's get some mm buy-in for the front of the series and
maybe it should go in through vfio since nvgrace is the only user of
these interfaces currently.  Thanks,

Alex

> 
> diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
> index 84d142a47ec6..91b4a3a135cf 100644
> --- a/drivers/vfio/pci/nvgrace-gpu/main.c
> +++ b/drivers/vfio/pci/nvgrace-gpu/main.c
> @@ -9,6 +9,7 @@
>  #include <linux/jiffies.h>
>  #include <linux/pci-p2pdma.h>
>  #include <linux/pm_runtime.h>
> +#include <linux/memory-failure.h>
>  
>  /*
>   * The device memory usable to the workloads running in the VM is cached
> @@ -49,6 +50,7 @@ struct mem_region {
>  		void *memaddr;
>  		void __iomem *ioaddr;
>  	};                      /* Base virtual address of the region */
> +	struct pfn_address_space pfn_address_space;
>  };
>  
>  struct nvgrace_gpu_pci_core_device {
> @@ -88,6 +90,83 @@ nvgrace_gpu_memregion(int index,
>  	return NULL;
>  }
>  
> +static int pfn_memregion_offset(struct nvgrace_gpu_pci_core_device *nvdev,
> +				unsigned int index,
> +				unsigned long pfn,
> +				pgoff_t *pfn_offset_in_region)
> +{
> +	struct mem_region *region;
> +	unsigned long start_pfn, num_pages;
> +
> +	region = nvgrace_gpu_memregion(index, nvdev);
> +	if (!region)
> +		return -EINVAL;
> +
> +	start_pfn = PHYS_PFN(region->memphys);
> +	num_pages = region->memlength >> PAGE_SHIFT;
> +
> +	if (pfn < start_pfn || pfn >= start_pfn + num_pages)
> +		return -EFAULT;
> +
> +	*pfn_offset_in_region = pfn - start_pfn;
> +
> +	return 0;
> +}
> +
> +static inline
> +struct nvgrace_gpu_pci_core_device *vma_to_nvdev(struct vm_area_struct *vma);
> +
> +static int nvgrace_gpu_pfn_to_vma_pgoff(struct vm_area_struct *vma,
> +					unsigned long pfn,
> +					pgoff_t *pgoff)
> +{
> +	struct nvgrace_gpu_pci_core_device *nvdev;
> +	unsigned int index =
> +		vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
> +	pgoff_t vma_offset_in_region = vma->vm_pgoff &
> +		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
> +	pgoff_t pfn_offset_in_region;
> +	int ret;
> +
> +	nvdev = vma_to_nvdev(vma);
> +	if (!nvdev)
> +		return -ENOENT;
> +
> +	ret = pfn_memregion_offset(nvdev, index, pfn, &pfn_offset_in_region);
> +	if (ret)
> +		return ret;
> +
> +	/* Ensure PFN is not before VMA's start within the region */
> +	if (pfn_offset_in_region < vma_offset_in_region)
> +		return -EFAULT;
> +
> +	/* Calculate offset from VMA start */
> +	*pgoff = vma->vm_pgoff +
> +		 (pfn_offset_in_region - vma_offset_in_region);
> +
> +	return 0;
> +}
> +
> +static int
> +nvgrace_gpu_vfio_pci_register_pfn_range(struct vfio_device *core_vdev,
> +					struct mem_region *region)
> +{
> +	int ret;
> +	unsigned long pfn, nr_pages;
> +
> +	pfn = PHYS_PFN(region->memphys);
> +	nr_pages = region->memlength >> PAGE_SHIFT;
> +
> +	region->pfn_address_space.node.start = pfn;
> +	region->pfn_address_space.node.last = pfn + nr_pages - 1;
> +	region->pfn_address_space.mapping = core_vdev->inode->i_mapping;
> +	region->pfn_address_space.pfn_to_vma_pgoff = nvgrace_gpu_pfn_to_vma_pgoff;
> +
> +	ret = register_pfn_address_space(&region->pfn_address_space);
> +
> +	return ret;
> +}
> +
>  static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
>  {
>  	struct vfio_pci_core_device *vdev =
> @@ -114,14 +193,28 @@ static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
>  	 * memory mapping.
>  	 */
>  	ret = vfio_pci_core_setup_barmap(vdev, 0);
> -	if (ret) {
> -		vfio_pci_core_disable(vdev);
> -		return ret;
> +	if (ret)
> +		goto error_exit;
> +
> +	if (nvdev->resmem.memlength) {
> +		ret = nvgrace_gpu_vfio_pci_register_pfn_range(core_vdev, &nvdev->resmem);
> +		if (ret && ret != -EOPNOTSUPP)
> +			goto error_exit;
>  	}
>  
> -	vfio_pci_core_finish_enable(vdev);
> +	ret = nvgrace_gpu_vfio_pci_register_pfn_range(core_vdev, &nvdev->usemem);
> +	if (ret && ret != -EOPNOTSUPP)
> +		goto register_mem_failed;
>  
> +	vfio_pci_core_finish_enable(vdev);
>  	return 0;
> +
> +register_mem_failed:
> +	if (nvdev->resmem.memlength)
> +		unregister_pfn_address_space(&nvdev->resmem.pfn_address_space);
> +error_exit:
> +	vfio_pci_core_disable(vdev);
> +	return ret;
>  }
>  
>  static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
> @@ -130,6 +223,11 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
>  		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
>  			     core_device.vdev);
>  
> +	if (nvdev->resmem.memlength)
> +		unregister_pfn_address_space(&nvdev->resmem.pfn_address_space);
> +
> +	unregister_pfn_address_space(&nvdev->usemem.pfn_address_space);
> +
>  	/* Unmap the mapping to the device memory cached region */
>  	if (nvdev->usemem.memaddr) {
>  		memunmap(nvdev->usemem.memaddr);
> @@ -247,6 +345,16 @@ static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = {
>  #endif
>  };
>  
> +static inline
> +struct nvgrace_gpu_pci_core_device *vma_to_nvdev(struct vm_area_struct *vma)
> +{
> +	/* Check if this VMA belongs to us */
> +	if (vma->vm_ops != &nvgrace_gpu_vfio_pci_mmap_ops)
> +		return NULL;
> +
> +	return vma->vm_private_data;
> +}
> +
>  static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
>  			    struct vm_area_struct *vma)
>  {