Message-ID: <20251127170632.3477-7-ankita@nvidia.com>
Date: Thu, 27 Nov 2025 17:06:32 +0000
From: <ankita@...dia.com>
To: <ankita@...dia.com>, <jgg@...pe.ca>, <yishaih@...dia.com>,
<skolothumtho@...dia.com>, <kevin.tian@...el.com>, <alex@...zbot.org>,
<aniketa@...dia.com>, <vsethi@...dia.com>, <mochs@...dia.com>
CC: <Yunxiang.Li@....com>, <yi.l.liu@...el.com>,
<zhangdongdong@...incomputing.com>, <avihaih@...dia.com>,
<bhelgaas@...gle.com>, <peterx@...hat.com>, <pstanner@...hat.com>,
<apopple@...dia.com>, <kvm@...r.kernel.org>, <linux-kernel@...r.kernel.org>,
<cjia@...dia.com>, <kwankhede@...dia.com>, <targupta@...dia.com>,
<zhiw@...dia.com>, <danw@...dia.com>, <dnigam@...dia.com>, <kjaju@...dia.com>
Subject: [PATCH v9 6/6] vfio/nvgrace-gpu: wait for the GPU mem to be ready
From: Ankit Agrawal <ankita@...dia.com>
Speculative CPU prefetches to GPU memory while the GPU is not yet
ready after reset can cause harmless corrected RAS events to be
logged on Grace systems. It is thus preferred that the mapping not
be re-established until the GPU is ready post reset. GPU readiness
can be checked through BAR0 registers, similar to the check done at
device probe time.
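
For reference, that readiness test amounts to a bounded poll of BAR0
status registers, roughly along the lines of the sketch below. The
register offset, ready value and poll budget here are illustrative
placeholders, not the driver's actual definitions.

  #include <linux/delay.h>
  #include <linux/errno.h>
  #include <linux/io.h>
  #include <linux/jiffies.h>

  #define EXAMPLE_READY_OFFSET	0x0	/* hypothetical BAR0 status register */
  #define EXAMPLE_READY_VALUE	0x1	/* hypothetical "device ready" value */
  #define EXAMPLE_POLL_QUANTUM_MS	1000
  #define EXAMPLE_POLL_TIMEOUT_MS	(30 * 1000)

  /* Poll a BAR0 status register until it reports ready or the budget runs out. */
  static int example_wait_device_ready(void __iomem *bar0)
  {
  	unsigned long timeout = jiffies + msecs_to_jiffies(EXAMPLE_POLL_TIMEOUT_MS);

  	do {
  		/*
  		 * The read only makes sense with PCI memory decode enabled,
  		 * hence the __vfio_pci_memory_enabled() check before polling.
  		 */
  		if (ioread32(bar0 + EXAMPLE_READY_OFFSET) == EXAMPLE_READY_VALUE)
  			return 0;
  		msleep(EXAMPLE_POLL_QUANTUM_MS);
  	} while (!time_after(jiffies, timeout));

  	return -ETIME;
  }
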
It can take several seconds for the GPU to become ready, so it is
desirable that this time overlap as much of the VM startup as
possible to reduce the impact on VM boot time. The GPU readiness
state is therefore checked on the first fault/huge_fault request or
read/write access, which amortizes the readiness wait.
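
A condensed sketch of that lazy check follows. The structure mirrors
nvgrace_gpu_check_device_ready() in the diff below, but the names are
hypothetical and the memory_lock and memory-enable handling are
omitted for brevity.

  struct example_dev {
  	void __iomem *bar0;
  	bool reset_done;
  };

  /*
   * Only the first access after a reset pays the readiness wait;
   * later accesses take the fast path.
   */
  static int example_lazy_check_ready(struct example_dev *dev)
  {
  	int ret;

  	if (!dev->reset_done)
  		return 0;

  	ret = example_wait_device_ready(dev->bar0);	/* may block for seconds */
  	if (ret)
  		return ret;

  	dev->reset_done = false;
  	return 0;
  }
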
The first fault and read/write access check the GPU state when the
reset_done flag, which denotes that the GPU has just been reset, is
set. The memory_lock is taken across the map/access to avoid races
with GPU reset.
Also check that memory is enabled before waiting for the GPU to
become ready; otherwise the readiness check would block for 30
seconds.
Lastly, wrap the read/write accesses with runtime PM handling.
Cc: Shameer Kolothum <skolothumtho@...dia.com>
Cc: Alex Williamson <alex@...zbot.org>
Cc: Jason Gunthorpe <jgg@...pe.ca>
Cc: Vikram Sethi <vsethi@...dia.com>
Reviewed-by: Shameer Kolothum <skolothumtho@...dia.com>
Suggested-by: Alex Williamson <alex@...zbot.org>
Signed-off-by: Ankit Agrawal <ankita@...dia.com>
---
drivers/vfio/pci/nvgrace-gpu/main.c | 103 ++++++++++++++++++++++++----
drivers/vfio/pci/vfio_pci_config.c | 1 +
drivers/vfio/pci/vfio_pci_priv.h | 1 -
include/linux/vfio_pci_core.h | 1 +
4 files changed, 93 insertions(+), 13 deletions(-)
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index bf0a3b65c72e..bf54d1c6bd73 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -7,6 +7,7 @@
#include <linux/vfio_pci_core.h>
#include <linux/delay.h>
#include <linux/jiffies.h>
+#include <linux/pm_runtime.h>
/*
* The device memory usable to the workloads running in the VM is cached
@@ -104,6 +105,19 @@ static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
mutex_init(&nvdev->remap_lock);
}
+ /*
+ * GPU readiness is checked by reading the BAR0 registers.
+ *
+ * ioremap BAR0 so that the mapping is available for the register
+ * reads done on the first fault, before any GPU memory mapping is
+ * established.
+ */
+ ret = vfio_pci_core_setup_barmap(vdev, 0);
+ if (ret) {
+ vfio_pci_core_disable(vdev);
+ return ret;
+ }
+
vfio_pci_core_finish_enable(vdev);
return 0;
@@ -146,6 +160,34 @@ static int nvgrace_gpu_wait_device_ready(void __iomem *io)
return -ETIME;
}
+/*
+ * If the GPU memory is accessed by the CPU while the GPU is not ready
+ * after reset, it can cause harmless corrected RAS events to be logged.
+ * Make sure the GPU is ready before establishing the mappings.
+ */
+static int
+nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev)
+{
+ struct vfio_pci_core_device *vdev = &nvdev->core_device;
+ int ret;
+
+ lockdep_assert_held_read(&vdev->memory_lock);
+
+ if (!nvdev->reset_done)
+ return 0;
+
+ if (!__vfio_pci_memory_enabled(vdev))
+ return -EIO;
+
+ ret = nvgrace_gpu_wait_device_ready(vdev->barmap[0]);
+ if (ret)
+ return ret;
+
+ nvdev->reset_done = false;
+
+ return 0;
+}
+
static unsigned long addr_to_pgoff(struct vm_area_struct *vma,
unsigned long addr)
{
@@ -175,8 +217,13 @@ static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf,
pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr);
if (is_aligned_for_order(vma, addr, pfn, order)) {
- scoped_guard(rwsem_read, &vdev->memory_lock)
+ scoped_guard(rwsem_read, &vdev->memory_lock) {
+ if (vdev->pm_runtime_engaged ||
+ nvgrace_gpu_check_device_ready(nvdev))
+ return VM_FAULT_SIGBUS;
+
ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order);
+ }
}
dev_dbg_ratelimited(&vdev->pdev->dev,
@@ -563,6 +610,7 @@ static ssize_t
nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev,
char __user *buf, size_t count, loff_t *ppos)
{
+ struct vfio_pci_core_device *vdev = &nvdev->core_device;
u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
struct mem_region *memregion;
@@ -589,9 +637,15 @@ nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev,
else
mem_count = min(count, memregion->memlength - (size_t)offset);
- ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos);
- if (ret)
- return ret;
+ scoped_guard(rwsem_read, &vdev->memory_lock) {
+ ret = nvgrace_gpu_check_device_ready(nvdev);
+ if (ret)
+ return ret;
+
+ ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos);
+ if (ret)
+ return ret;
+ }
/*
* Only the device memory present on the hardware is mapped, which may
@@ -616,9 +670,16 @@ nvgrace_gpu_read(struct vfio_device *core_vdev,
struct nvgrace_gpu_pci_core_device *nvdev =
container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
core_device.vdev);
+ struct vfio_pci_core_device *vdev = &nvdev->core_device;
+ int ret;
- if (nvgrace_gpu_memregion(index, nvdev))
- return nvgrace_gpu_read_mem(nvdev, buf, count, ppos);
+ if (nvgrace_gpu_memregion(index, nvdev)) {
+ if (pm_runtime_resume_and_get(&vdev->pdev->dev))
+ return -EIO;
+ ret = nvgrace_gpu_read_mem(nvdev, buf, count, ppos);
+ pm_runtime_put(&vdev->pdev->dev);
+ return ret;
+ }
if (index == VFIO_PCI_CONFIG_REGION_INDEX)
return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos);
@@ -680,6 +741,7 @@ static ssize_t
nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev,
size_t count, loff_t *ppos, const char __user *buf)
{
+ struct vfio_pci_core_device *vdev = &nvdev->core_device;
unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
struct mem_region *memregion;
@@ -709,9 +771,15 @@ nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev,
*/
mem_count = min(count, memregion->memlength - (size_t)offset);
- ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos);
- if (ret)
- return ret;
+ scoped_guard(rwsem_read, &vdev->memory_lock) {
+ ret = nvgrace_gpu_check_device_ready(nvdev);
+ if (ret)
+ return ret;
+
+ ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos);
+ if (ret)
+ return ret;
+ }
exitfn:
*ppos += count;
@@ -725,10 +793,17 @@ nvgrace_gpu_write(struct vfio_device *core_vdev,
struct nvgrace_gpu_pci_core_device *nvdev =
container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
core_device.vdev);
+ struct vfio_pci_core_device *vdev = &nvdev->core_device;
unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ int ret;
- if (nvgrace_gpu_memregion(index, nvdev))
- return nvgrace_gpu_write_mem(nvdev, count, ppos, buf);
+ if (nvgrace_gpu_memregion(index, nvdev)) {
+ if (pm_runtime_resume_and_get(&vdev->pdev->dev))
+ return -EIO;
+ ret = nvgrace_gpu_write_mem(nvdev, count, ppos, buf);
+ pm_runtime_put(&vdev->pdev->dev);
+ return ret;
+ }
if (index == VFIO_PCI_CONFIG_REGION_INDEX)
return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos);
@@ -1051,7 +1126,11 @@ MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table);
* faults and read/writes accesses to prevent potential RAS events logging.
*
* First fault or access after a reset needs to poll device readiness,
- * flag that a reset has occurred.
+ * flag that a reset has occurred. The readiness test is done by holding
+ * the memory_lock read lock and we expect all vfio-pci initiated resets to
+ * hold the memory_lock write lock to avoid races. However, .reset_done
+ * extends beyond the scope of vfio-pci initiated resets, so we cannot
+ * assert this behavior with lockdep_assert_held_write.
*/
static void nvgrace_gpu_vfio_pci_reset_done(struct pci_dev *pdev)
{
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 333fd149c21a..d0cc5f04a8f2 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -416,6 +416,7 @@ bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev)
return pdev->current_state < PCI_D3hot &&
(pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY));
}
+EXPORT_SYMBOL_GPL(__vfio_pci_memory_enabled);
/*
* Restore the *real* BARs after we detect a FLR or backdoor reset.
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index a9972eacb293..7b1776bae802 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -60,7 +60,6 @@ void vfio_config_free(struct vfio_pci_core_device *vdev);
int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev,
pci_power_t state);
-bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev);
void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev);
u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev);
void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev,
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 3117a390c4eb..6db13f66b5e4 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -137,6 +137,7 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
void __iomem *io, char __user *buf,
loff_t off, size_t count, size_t x_start,
size_t x_end, bool iswrite);
+bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev);
bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt,
loff_t reg_start, size_t reg_cnt,
loff_t *buf_offset,
--
2.34.1