linux-kernel - [PATCH v7 6/6] vfio/nvgrace-gpu: wait for the GPU mem to be ready

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251126052627.43335-7-ankita@nvidia.com>
Date: Wed, 26 Nov 2025 05:26:27 +0000
From: <ankita@...dia.com>
To: <ankita@...dia.com>, <jgg@...pe.ca>, <yishaih@...dia.com>,
	<skolothumtho@...dia.com>, <kevin.tian@...el.com>, <alex@...zbot.org>,
	<aniketa@...dia.com>, <vsethi@...dia.com>, <mochs@...dia.com>
CC: <Yunxiang.Li@....com>, <yi.l.liu@...el.com>,
	<zhangdongdong@...incomputing.com>, <avihaih@...dia.com>,
	<bhelgaas@...gle.com>, <peterx@...hat.com>, <pstanner@...hat.com>,
	<apopple@...dia.com>, <kvm@...r.kernel.org>, <linux-kernel@...r.kernel.org>,
	<cjia@...dia.com>, <kwankhede@...dia.com>, <targupta@...dia.com>,
	<zhiw@...dia.com>, <danw@...dia.com>, <dnigam@...dia.com>, <kjaju@...dia.com>
Subject: [PATCH v7 6/6] vfio/nvgrace-gpu: wait for the GPU mem to be ready

From: Ankit Agrawal <ankita@...dia.com>

Speculative prefetches from CPU to GPU memory until the GPU is
ready after reset can cause harmless corrected RAS events to
be logged on Grace systems. It is thus preferred that the
mapping not be re-established until the GPU is ready post reset.

The GPU readiness can be checked through BAR0 registers similar
to the checking at the time of device probe.

It can take several seconds for the GPU to be ready. So it is
desirable that the time overlaps as much of the VM startup as
possible to reduce impact on the VM bootup time. The GPU
readiness state is thus checked on the first fault/huge_fault
request or read/write access which amortizes the GPU readiness
time.

The first fault and read/write checks the GPU state when the
reset_done flag - which denotes whether the GPU has just been
reset. The memory_lock is taken across map/access to avoid
races with GPU reset.

Cc: Shameer Kolothum <skolothumtho@...dia.com>
Cc: Alex Williamson <alex@...zbot.org>
Cc: Jason Gunthorpe <jgg@...pe.ca>
Cc: Vikram Sethi <vsethi@...dia.com>
Suggested-by: Alex Williamson <alex@...zbot.org>
Signed-off-by: Ankit Agrawal <ankita@...dia.com>
---
 drivers/vfio/pci/nvgrace-gpu/main.c | 81 +++++++++++++++++++++++++----
 1 file changed, 72 insertions(+), 9 deletions(-)

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index b46984e76be7..3064f8aca858 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -104,6 +104,17 @@ static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
 		mutex_init(&nvdev->remap_lock);
 	}
 
+	/*
+	 * GPU readiness is checked by reading the BAR0 registers.
+	 *
+	 * ioremap BAR0 to ensure that the BAR0 mapping is present before
+	 * register reads on first fault before establishing any GPU
+	 * memory mapping.
+	 */
+	ret = vfio_pci_core_setup_barmap(vdev, 0);
+	if (ret)
+		return ret;
+
 	vfio_pci_core_finish_enable(vdev);
 
 	return 0;
@@ -146,6 +157,31 @@ static int nvgrace_gpu_wait_device_ready(void __iomem *io)
 	return -ETIME;
 }
 
+/*
+ * If the GPU memory is accessed by the CPU while the GPU is not ready
+ * after reset, it can cause harmless corrected RAS events to be logged.
+ * Make sure the GPU is ready before establishing the mappings.
+ */
+static int
+nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev)
+{
+	struct vfio_pci_core_device *vdev = &nvdev->core_device;
+	int ret;
+
+	lockdep_assert_held_read(&vdev->memory_lock);
+
+	if (!nvdev->reset_done)
+		return 0;
+
+	ret = nvgrace_gpu_wait_device_ready(vdev->barmap[0]);
+	if (ret)
+		return ret;
+
+	nvdev->reset_done = false;
+
+	return 0;
+}
+
 static unsigned long addr_to_pgoff(struct vm_area_struct *vma,
 				   unsigned long addr)
 {
@@ -163,13 +199,13 @@ static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf,
 	struct vfio_pci_core_device *vdev = &nvdev->core_device;
 	unsigned int index =
 		vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
-	vm_fault_t ret = VM_FAULT_SIGBUS;
+	vm_fault_t ret;
 	struct mem_region *memregion;
 	unsigned long pfn, addr;
 
 	memregion = nvgrace_gpu_memregion(index, nvdev);
 	if (!memregion)
-		return ret;
+		return VM_FAULT_SIGBUS;
 
 	addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
 	pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr);
@@ -179,8 +215,14 @@ static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf,
 		goto out;
 	}
 
-	scoped_guard(rwsem_read, &vdev->memory_lock)
+	scoped_guard(rwsem_read, &vdev->memory_lock) {
+		if (nvgrace_gpu_check_device_ready(nvdev)) {
+			ret = VM_FAULT_SIGBUS;
+			goto out;
+		}
+
 		ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order);
+	}
 
 out:
 	dev_dbg_ratelimited(&vdev->pdev->dev,
@@ -593,9 +635,15 @@ nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev,
 	else
 		mem_count = min(count, memregion->memlength - (size_t)offset);
 
-	ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos);
-	if (ret)
-		return ret;
+	scoped_guard(rwsem_read, &nvdev->core_device.memory_lock) {
+		ret = nvgrace_gpu_check_device_ready(nvdev);
+		if (ret)
+			return ret;
+
+		ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos);
+		if (ret)
+			return ret;
+	}
 
 	/*
 	 * Only the device memory present on the hardware is mapped, which may
@@ -713,9 +761,15 @@ nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev,
 	 */
 	mem_count = min(count, memregion->memlength - (size_t)offset);
 
-	ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos);
-	if (ret)
-		return ret;
+	scoped_guard(rwsem_read, &nvdev->core_device.memory_lock) {
+		ret = nvgrace_gpu_check_device_ready(nvdev);
+		if (ret)
+			return ret;
+
+		ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos);
+		if (ret)
+			return ret;
+	}
 
 exitfn:
 	*ppos += count;
@@ -1056,6 +1110,15 @@ MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table);
  *
  * The reset_done implementation is triggered on every reset and is used
  * set the reset_done variable that assists in achieving the serialization.
+ *
+ * The writer memory_lock is held during reset through ioctl and FLR, while
+ * the reader is held in the fault and access code.
+ *
+ * The lock is however not taken during reset called through
+ * vfio_pci_core_enable during open. Whilst a serialization is not
+ * required at that early stage, it still prevents from putting
+ * lockdep_assert_held_write in this function.
+ *
  */
 static void nvgrace_gpu_vfio_pci_reset_done(struct pci_dev *pdev)
 {
-- 
2.34.1