Message-ID: <20250109142123.3537519a.alex.williamson@redhat.com>
Date: Thu, 9 Jan 2025 14:21:23 -0500
From: Alex Williamson <alex.williamson@...hat.com>
To: <ankita@...dia.com>
Cc: <jgg@...dia.com>, <yishaih@...dia.com>,
<shameerali.kolothum.thodi@...wei.com>, <kevin.tian@...el.com>,
<zhiw@...dia.com>, <aniketa@...dia.com>, <cjia@...dia.com>,
<kwankhede@...dia.com>, <targupta@...dia.com>, <vsethi@...dia.com>,
<acurrid@...dia.com>, <apopple@...dia.com>, <jhubbard@...dia.com>,
<danw@...dia.com>, <anuaggarwal@...dia.com>, <mochs@...dia.com>,
<kvm@...r.kernel.org>, <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH v2 2/3] vfio/nvgrace-gpu: Expose the blackwell device PF
BAR1 to the VM
On Sun, 5 Jan 2025 17:36:14 +0000
<ankita@...dia.com> wrote:
> From: Ankit Agrawal <ankita@...dia.com>
>
> There is a HW defect on Grace Hopper (GH) related to supporting the
> Multi-Instance GPU (MIG) feature [1] that necessitated the presence
> of a 1G region carved out from the device memory and mapped as
> uncached. The 1G region is shown as a fake BAR (comprising regions 2 and 3)
> to work around the issue.
>
> The Grace Blackwell systems (GB) differ from GH systems in the following
> aspects:
> 1. The aforementioned HW defect is fixed on GB systems.
> 2. There is a usable BAR1 (region 2 and 3) on GB systems for the
> GPUdirect RDMA feature [2].
>
> This patch accommodates those GB changes by showing the 64b physical
> device BAR1 (regions 2 and 3) to the VM instead of the fake one. This
> takes care of both the differences.
>
> Moreover, the entire device memory is exposed on GB as cacheable to
> the VM as there is no carveout required.
>
> Link: https://www.nvidia.com/en-in/technologies/multi-instance-gpu/ [1]
> Link: https://docs.nvidia.com/cuda/gpudirect-rdma/ [2]
>
> Signed-off-by: Ankit Agrawal <ankita@...dia.com>
> ---
> drivers/vfio/pci/nvgrace-gpu/main.c | 32 +++++++++++++++++++++--------
> 1 file changed, 24 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
> index 85eacafaffdf..44a276c886e1 100644
> --- a/drivers/vfio/pci/nvgrace-gpu/main.c
> +++ b/drivers/vfio/pci/nvgrace-gpu/main.c
> @@ -72,7 +72,7 @@ nvgrace_gpu_memregion(int index,
>  	if (index == USEMEM_REGION_INDEX)
>  		return &nvdev->usemem;
>  
> -	if (index == RESMEM_REGION_INDEX)
> +	if (!nvdev->has_mig_hw_bug_fix && index == RESMEM_REGION_INDEX)
>  		return &nvdev->resmem;
>  
>  	return NULL;
> @@ -715,6 +715,16 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = {
>  	.detach_ioas = vfio_iommufd_physical_detach_ioas,
>  };
>  
> +static void
> +nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
> +			      struct nvgrace_gpu_pci_core_device *nvdev,
> +			      u64 memphys, u64 memlength)
> +{
> +	nvdev->usemem.memphys = memphys;
> +	nvdev->usemem.memlength = memlength;
> +	nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
> +}
> +
>  static int
>  nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev,
>  				  u64 *pmemphys, u64 *pmemlength)
> @@ -752,9 +762,9 @@ nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev,
>  }
>  
>  static int
> -nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
> -			      struct nvgrace_gpu_pci_core_device *nvdev,
> -			      u64 memphys, u64 memlength)
> +nvgrace_gpu_nvdev_struct_workaround(struct pci_dev *pdev,
> +				    struct nvgrace_gpu_pci_core_device *nvdev,
> +				    u64 memphys, u64 memlength)
>  {
>  	int ret = 0;
>  
> @@ -864,10 +874,16 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
>  	 * Device memory properties are identified in the host ACPI
>  	 * table. Set the nvgrace_gpu_pci_core_device structure.
>  	 */
> -	ret = nvgrace_gpu_init_nvdev_struct(pdev, nvdev,
> -					    memphys, memlength);
> -	if (ret)
> -		goto out_put_vdev;
> +	if (nvdev->has_mig_hw_bug_fix) {
> +		nvgrace_gpu_init_nvdev_struct(pdev, nvdev,
> +					      memphys, memlength);
> +	} else {
> +		ret = nvgrace_gpu_nvdev_struct_workaround(pdev, nvdev,
> +							  memphys,
> +							  memlength);
> +		if (ret)
> +			goto out_put_vdev;
> +	}
Doesn't this work out much more naturally if we just do something like:
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index 85eacafaffdf..43a9457442ff 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -17,9 +17,6 @@
 #define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX
 #define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX
 
-/* Memory size expected as non cached and reserved by the VM driver */
-#define RESMEM_SIZE SZ_1G
-
 /* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
 #define MEMBLK_SIZE SZ_512M
 
@@ -72,7 +69,7 @@ nvgrace_gpu_memregion(int index,
 	if (index == USEMEM_REGION_INDEX)
 		return &nvdev->usemem;
 
-	if (index == RESMEM_REGION_INDEX)
+	if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX)
 		return &nvdev->resmem;
 
 	return NULL;
@@ -757,6 +754,13 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
 			      u64 memphys, u64 memlength)
 {
 	int ret = 0;
+	u64 resmem_size = 0;
+
+	/*
+	 * Comment about the GH bug that requires this and fix in GB
+	 */
+	if (!nvdev->has_mig_hw_bug_fix)
+		resmem_size = SZ_1G;
 
 	/*
 	 * The VM GPU device driver needs a non-cacheable region to support
@@ -780,7 +784,7 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
 	 * memory (usemem) is added to the kernel for usage by the VM
 	 * workloads. Make the usable memory size memblock aligned.
 	 */
-	if (check_sub_overflow(memlength, RESMEM_SIZE,
+	if (check_sub_overflow(memlength, resmem_size,
 			       &nvdev->usemem.memlength)) {
 		ret = -EOVERFLOW;
 		goto done;
@@ -813,7 +817,9 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
 	 * the BAR size for them.
 	 */
 	nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
-	nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength);
+	if (nvdev->resmem.memlength)
+		nvdev->resmem.bar_size =
+			roundup_pow_of_two(nvdev->resmem.memlength);
 done:
 	return ret;
 }
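
For completeness, both versions of the diff assume nvdev->has_mig_hw_bug_fix
is already populated by the time nvgrace_gpu_init_nvdev_struct() runs,
presumably from the probe path earlier in the series.  A hypothetical sketch
of one way that could look, where the helper name and the device ID macro
are placeholders rather than anything taken from this series:

/* Placeholder device ID for illustration only, not from this series. */
#define NVGRACE_GPU_GB_EXAMPLE_DEVID	0xffff

static void
nvgrace_gpu_detect_mig_hw_bug_fix(struct pci_dev *pdev,
				  struct nvgrace_gpu_pci_core_device *nvdev)
{
	/* GB parts carry the MIG HW bug fix; GH parts do not. */
	nvdev->has_mig_hw_bug_fix = (pdev->device == NVGRACE_GPU_GB_EXAMPLE_DEVID);
}

With the flag set once at probe time, the RESMEM handling above only needs
to key off resmem.memlength instead of repeating the bug-fix check at each
site.
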
Thanks,
Alex