Message-ID: <20251118074422.58081-7-ankita@nvidia.com>
Date: Tue, 18 Nov 2025 07:44:22 +0000
From: <ankita@...dia.com>
To: <ankita@...dia.com>, <jgg@...pe.ca>, <yishaih@...dia.com>,
	<skolothumtho@...dia.com>, <kevin.tian@...el.com>, <alex@...zbot.org>,
	<aniketa@...dia.com>, <vsethi@...dia.com>, <mochs@...dia.com>
CC: <Yunxiang.Li@....com>, <yi.l.liu@...el.com>,
	<zhangdongdong@...incomputing.com>, <avihaih@...dia.com>,
	<bhelgaas@...gle.com>, <peterx@...hat.com>, <pstanner@...hat.com>,
	<apopple@...dia.com>, <kvm@...r.kernel.org>, <linux-kernel@...r.kernel.org>,
	<cjia@...dia.com>, <kwankhede@...dia.com>, <targupta@...dia.com>,
	<zhiw@...dia.com>, <danw@...dia.com>, <dnigam@...dia.com>, <kjaju@...dia.com>
Subject: [PATCH v2 6/6] vfio/nvgrace-gpu: wait for the GPU mem to be ready

From: Ankit Agrawal <ankita@...dia.com>

Speculative CPU prefetches to GPU memory while the GPU is not yet
ready after reset can cause harmless corrected RAS events to be
logged. The mapping therefore must not be re-established until the
GPU is ready post reset.

Wait for the GPU to become ready on the first fault before
establishing the CPU mapping to the GPU memory. GPU readiness can be
checked through the BAR0 registers, as is already done at device
probe.
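
For illustration only, a minimal sketch of the readiness-poll pattern
follows; the register offset, ready bit and timeouts are hypothetical
placeholders, and the actual check is the driver's existing
nvgrace_gpu_wait_device_ready() helper operating on the BAR0 mapping:

  #include <linux/io.h>
  #include <linux/delay.h>
  #include <linux/jiffies.h>
  #include <linux/bits.h>
  #include <linux/errno.h>

  /* Hypothetical placeholders, not the real NVIDIA register layout. */
  #define EXAMPLE_READY_REG	0x0
  #define EXAMPLE_READY_BIT	BIT(0)
  #define EXAMPLE_TIMEOUT_MS	5000
  #define EXAMPLE_POLL_MS	10

  static int example_wait_device_ready(void __iomem *bar0)
  {
  	unsigned long timeout = jiffies + msecs_to_jiffies(EXAMPLE_TIMEOUT_MS);

  	/* Poll a BAR0 status register until the device reports ready. */
  	do {
  		if (readl(bar0 + EXAMPLE_READY_REG) & EXAMPLE_READY_BIT)
  			return 0;
  		msleep(EXAMPLE_POLL_MS);
  	} while (time_before(jiffies, timeout));

  	return -ETIMEDOUT;
  }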

The readiness state is tracked with a flag that is checked on the
first fault/huge_fault request and unset on every reset request.

So intercept the following paths to GPU reset, unset gpu_mem_mapped
there, and then use the flag to determine whether to wait before
mapping (an illustrative userspace sketch of both paths follows the
list):
1. VFIO_DEVICE_RESET ioctl call
2. FLR through config space.
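
For illustration, a minimal userspace sketch of the two reset paths,
assuming the VFIO device fd is already open, config_off was obtained
via VFIO_DEVICE_GET_REGION_INFO for the config space region, and
pcie_cap_off is the PCI Express capability offset found by walking
config space (both lookups are omitted here):

  #include <stdint.h>
  #include <unistd.h>
  #include <sys/ioctl.h>
  #include <sys/types.h>
  #include <linux/vfio.h>
  #include <linux/pci_regs.h>

  static int example_trigger_resets(int device_fd, off_t config_off,
  				    int pcie_cap_off)
  {
  	uint16_t devctl;

  	/* Path 1: VFIO_DEVICE_RESET ioctl. */
  	if (ioctl(device_fd, VFIO_DEVICE_RESET))
  		return -1;

  	/*
  	 * Path 2: FLR, requested by setting the BCR_FLR bit in the
  	 * Device Control register through the config space region.
  	 */
  	if (pread(device_fd, &devctl, sizeof(devctl),
  		  config_off + pcie_cap_off + PCI_EXP_DEVCTL) != sizeof(devctl))
  		return -1;
  	devctl |= PCI_EXP_DEVCTL_BCR_FLR;
  	if (pwrite(device_fd, &devctl, sizeof(devctl),
  		   config_off + pcie_cap_off + PCI_EXP_DEVCTL) != sizeof(devctl))
  		return -1;

  	return 0;
  }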

cc: Alex Williamson <alex@...zbot.org>
cc: Jason Gunthorpe <jgg@...pe.ca>
cc: Vikram Sethi <vsethi@...dia.com>
Signed-off-by: Ankit Agrawal <ankita@...dia.com>
---
 drivers/vfio/pci/nvgrace-gpu/main.c | 52 +++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index 7618c3f515cc..5d7bf5b1e7a2 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -58,6 +58,8 @@ struct nvgrace_gpu_pci_core_device {
 	/* Lock to control device memory kernel mapping */
 	struct mutex remap_lock;
 	bool has_mig_hw_bug;
+	/* Any GPU memory mapped to the VMA */
+	bool gpu_mem_mapped;
 };
 
 static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
@@ -102,6 +104,8 @@ static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
 		mutex_init(&nvdev->remap_lock);
 	}
 
+	nvdev->gpu_mem_mapped = false;
+
 	vfio_pci_core_finish_enable(vdev);
 
 	return 0;
@@ -158,6 +162,24 @@ static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf,
 	struct mem_region *memregion;
 	unsigned long pgoff, pfn, addr;
 
+	/*
+	 * If the GPU memory is accessed by the CPU while the GPU is
+	 * not ready after reset, it can cause harmless corrected RAS
+	 * events to be logged. Make sure the GPU is ready before
+	 * establishing the mappings.
+	 */
+	if (!nvdev->gpu_mem_mapped) {
+		struct vfio_pci_core_device *vdev = &nvdev->core_device;
+
+		if (!vdev->barmap[0])
+			return VM_FAULT_SIGBUS;
+
+		if (nvgrace_gpu_wait_device_ready(vdev->barmap[0]))
+			return VM_FAULT_SIGBUS;
+
+		nvdev->gpu_mem_mapped = true;
+	}
+
 	memregion = nvgrace_gpu_memregion(index, nvdev);
 	if (!memregion)
 		return ret;
@@ -354,7 +376,17 @@ static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev,
 	case VFIO_DEVICE_IOEVENTFD:
 		return -ENOTTY;
 	case VFIO_DEVICE_RESET:
+		struct nvgrace_gpu_pci_core_device *nvdev =
+			container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
+				     core_device.vdev);
 		nvgrace_gpu_init_fake_bar_emu_regs(core_vdev);
+
+		/*
+		 * GPU memory is exposed as device BAR2 (region 4,5).
+		 * This would be zapped during GPU reset. Unset
+		 * nvdev->gpu_mem_mapped to reflect just that.
+		 */
+		nvdev->gpu_mem_mapped = false;
 		fallthrough;
 	default:
 		return vfio_pci_core_ioctl(core_vdev, cmd, arg);
@@ -439,11 +471,14 @@ nvgrace_gpu_write_config_emu(struct vfio_device *core_vdev,
 	struct nvgrace_gpu_pci_core_device *nvdev =
 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
 			     core_device.vdev);
+	struct vfio_pci_core_device *vdev =
+		container_of(core_vdev, struct vfio_pci_core_device, vdev);
 	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
 	struct mem_region *memregion = NULL;
 	size_t register_offset;
 	loff_t copy_offset;
 	size_t copy_count;
+	int cap_start = vfio_find_cap_start(vdev, pos);
 
 	if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2,
 						sizeof(u64), &copy_offset,
@@ -462,6 +497,23 @@ nvgrace_gpu_write_config_emu(struct vfio_device *core_vdev,
 		return copy_count;
 	}
 
+	if (vfio_pci_core_range_intersect_range(pos, count, cap_start + PCI_EXP_DEVCTL,
+						sizeof(u16), &copy_offset,
+						&copy_count, &register_offset)) {
+		__le16 val16;
+
+		if (copy_from_user((void *)&val16, buf, copy_count))
+			return -EFAULT;
+
+		/*
+		 * GPU memory is exposed as device BAR2 (region 4,5).
+		 * This would be zapped during GPU reset. Unset
+		 * nvdev->gpu_mem_mapped to reflect just that.
+		 */
+		if (val16 & cpu_to_le16(PCI_EXP_DEVCTL_BCR_FLR))
+			nvdev->gpu_mem_mapped = false;
+	}
+
 	return vfio_pci_core_write(core_vdev, buf, count, ppos);
 }
 
-- 
2.34.1

