lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251121141141.3175-8-ankita@nvidia.com>
Date: Fri, 21 Nov 2025 14:11:41 +0000
From: <ankita@...dia.com>
To: <ankita@...dia.com>, <jgg@...pe.ca>, <yishaih@...dia.com>,
	<skolothumtho@...dia.com>, <kevin.tian@...el.com>, <alex@...zbot.org>,
	<aniketa@...dia.com>, <vsethi@...dia.com>, <mochs@...dia.com>
CC: <Yunxiang.Li@....com>, <yi.l.liu@...el.com>,
	<zhangdongdong@...incomputing.com>, <avihaih@...dia.com>,
	<bhelgaas@...gle.com>, <peterx@...hat.com>, <pstanner@...hat.com>,
	<apopple@...dia.com>, <kvm@...r.kernel.org>, <linux-kernel@...r.kernel.org>,
	<cjia@...dia.com>, <kwankhede@...dia.com>, <targupta@...dia.com>,
	<zhiw@...dia.com>, <danw@...dia.com>, <dnigam@...dia.com>, <kjaju@...dia.com>
Subject: [PATCH v3 7/7] vfio/nvgrace-gpu: wait for the GPU mem to be ready

From: Ankit Agrawal <ankita@...dia.com>

Speculative prefetches from the CPU to GPU memory, issued after a reset
but before the GPU is ready, can cause harmless corrected RAS events to
be logged on Grace systems. It is thus preferred that the mapping not be
re-established until the GPU is ready post reset.

The GPU readiness can be checked through BAR0 registers similar
to the checking at the time of device probe.

It can take several seconds for the GPU to become ready. It is thus
desirable that this wait overlap with as much of the VM startup as
possible, to reduce the impact on VM bootup time. The GPU readiness
state is therefore checked on the first fault/huge_fault request, which
amortizes the GPU readiness time. The first fault is detected using a
flag. The flag is unset on every GPU reset request.

Intercept the following paths to the GPU reset and unset gpu_mem_mapped,
then use it to determine whether to wait for readiness before
re-establishing the mapping.
1. VFIO_DEVICE_RESET ioctl call
2. FLR through config space.

cc: Alex Williamson <alex@...zbot.org>
cc: Jason Gunthorpe <jgg@...pe.ca>
cc: Vikram Sethi <vsethi@...dia.com>
Signed-off-by: Ankit Agrawal <ankita@...dia.com>
---
 drivers/vfio/pci/nvgrace-gpu/main.c | 64 ++++++++++++++++++++++++++++-
 1 file changed, 63 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index 7618c3f515cc..23e3278aba25 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -58,6 +58,8 @@ struct nvgrace_gpu_pci_core_device {
 	/* Lock to control device memory kernel mapping */
 	struct mutex remap_lock;
 	bool has_mig_hw_bug;
+	/* Any GPU memory mapped to the VMA */
+	bool gpu_mem_mapped;
 };
 
 static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
@@ -102,9 +104,15 @@ static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
 		mutex_init(&nvdev->remap_lock);
 	}
 
+	nvdev->gpu_mem_mapped = false;
+
 	vfio_pci_core_finish_enable(vdev);
 
-	return 0;
+	/*
+	 * The GPU readiness is determined through BAR0 register reads.
+	 * Make sure BAR0 is mapped before any such check occurs.
+	 */
+	return vfio_pci_core_barmap(vdev, 0);
 }
 
 static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
@@ -158,6 +166,21 @@ static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf,
 	struct mem_region *memregion;
 	unsigned long pgoff, pfn, addr;
 
+	/*
+	 * If the GPU memory is accessed by the CPU while the GPU is
+	 * not ready after reset, it can cause harmless corrected RAS
+	 * events to be logged. Make sure the GPU is ready before
+	 * establishing the mappings.
+	 */
+	if (!nvdev->gpu_mem_mapped) {
+		struct vfio_pci_core_device *vdev = &nvdev->core_device;
+
+		if (nvgrace_gpu_wait_device_ready(vdev->barmap[0]))
+			return VM_FAULT_SIGBUS;
+
+		nvdev->gpu_mem_mapped = true;
+	}
+
 	memregion = nvgrace_gpu_memregion(index, nvdev);
 	if (!memregion)
 		return ret;
@@ -354,7 +377,17 @@ static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev,
 	case VFIO_DEVICE_IOEVENTFD:
 		return -ENOTTY;
 	case VFIO_DEVICE_RESET:
+		struct nvgrace_gpu_pci_core_device *nvdev =
+			container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
+				     core_device.vdev);
 		nvgrace_gpu_init_fake_bar_emu_regs(core_vdev);
+
+		/*
+		 * GPU memory is exposed as device BAR2 (region 4,5).
+		 * This would be zapped during GPU reset. Unset
+		 * nvdev->gpu_mem_mapped to reflect just that.
+		 */
+		nvdev->gpu_mem_mapped = false;
 		fallthrough;
 	default:
 		return vfio_pci_core_ioctl(core_vdev, cmd, arg);
@@ -439,11 +472,14 @@ nvgrace_gpu_write_config_emu(struct vfio_device *core_vdev,
 	struct nvgrace_gpu_pci_core_device *nvdev =
 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
 			     core_device.vdev);
+	struct vfio_pci_core_device *vdev =
+		container_of(core_vdev, struct vfio_pci_core_device, vdev);
 	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
 	struct mem_region *memregion = NULL;
 	size_t register_offset;
 	loff_t copy_offset;
 	size_t copy_count;
+	int cap_start = vfio_find_cap_start(vdev, pos);
 
 	if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2,
 						sizeof(u64), &copy_offset,
@@ -462,6 +498,23 @@ nvgrace_gpu_write_config_emu(struct vfio_device *core_vdev,
 		return copy_count;
 	}
 
+	if (vfio_pci_core_range_intersect_range(pos, count, cap_start + PCI_EXP_DEVCTL,
+						sizeof(u16), &copy_offset,
+						&copy_count, &register_offset)) {
+		__le16 val16;
+
+		if (copy_from_user((void *)&val16, buf, copy_count))
+			return -EFAULT;
+
+		/*
+		 * GPU memory is exposed as device BAR2 (region 4,5).
+		 * This would be zapped during GPU reset. Unset
+		 * nvdev->gpu_mem_mapped to reflect just that.
+		 */
+		if (val16 & cpu_to_le16(PCI_EXP_DEVCTL_BCR_FLR))
+			nvdev->gpu_mem_mapped = false;
+	}
+
 	return vfio_pci_core_write(core_vdev, buf, count, ppos);
 }
 
@@ -478,9 +531,18 @@ static int
 nvgrace_gpu_map_device_mem(int index,
 			   struct nvgrace_gpu_pci_core_device *nvdev)
 {
+	struct vfio_pci_core_device *vdev = &nvdev->core_device;
 	struct mem_region *memregion;
 	int ret = 0;
 
+	if (!nvdev->gpu_mem_mapped) {
+		ret = nvgrace_gpu_wait_device_ready(vdev->barmap[0]);
+		if (ret)
+			return ret;
+
+		nvdev->gpu_mem_mapped = true;
+	}
+
 	memregion = nvgrace_gpu_memregion(index, nvdev);
 	if (!memregion)
 		return -EINVAL;
-- 
2.34.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ