[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250904040828.319452-12-ankita@nvidia.com>
Date: Thu, 4 Sep 2025 04:08:25 +0000
From: <ankita@...dia.com>
To: <ankita@...dia.com>, <jgg@...dia.com>, <alex.williamson@...hat.com>,
<yishaih@...dia.com>, <skolothumtho@...dia.com>, <kevin.tian@...el.com>,
<yi.l.liu@...el.com>, <zhiw@...dia.com>
CC: <aniketa@...dia.com>, <cjia@...dia.com>, <kwankhede@...dia.com>,
<targupta@...dia.com>, <vsethi@...dia.com>, <acurrid@...dia.com>,
<apopple@...dia.com>, <jhubbard@...dia.com>, <danw@...dia.com>,
<anuaggarwal@...dia.com>, <mochs@...dia.com>, <kjaju@...dia.com>,
<dnigam@...dia.com>, <kvm@...r.kernel.org>, <linux-kernel@...r.kernel.org>
Subject: [RFC 11/14] vfio/nvgrace-egm: Fetch EGM region retired pages list
From: Ankit Agrawal <ankita@...dia.com>
Some system memory pages in the EGM region may have
uncorrectable ECC errors. A list of pages known to have such
errors (referred to as retired pages) is maintained by the Host
UEFI. The Host UEFI populates this list in a reserved region and
communicates the SPA (system physical address) of that region
through an ACPI DSDT property.
The nvgrace-egm module is responsible for storing the list of retired
page offsets so that it can be made available to usermode processes.
The module:
1. Gets the reserved memory region SPA and maps it to fetch
the list of bad pages.
2. Calculates the retired page offsets in the EGM and stores them.
Signed-off-by: Ankit Agrawal <ankita@...dia.com>
---
drivers/vfio/pci/nvgrace-gpu/egm.c | 81 ++++++++++++++++++++++++++
drivers/vfio/pci/nvgrace-gpu/egm_dev.c | 32 ++++++++--
drivers/vfio/pci/nvgrace-gpu/egm_dev.h | 5 +-
drivers/vfio/pci/nvgrace-gpu/main.c | 8 ++-
include/linux/nvgrace-egm.h | 2 +
5 files changed, 118 insertions(+), 10 deletions(-)
diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
index bf1241ed1d60..7a026b4d98f7 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
@@ -8,6 +8,11 @@
#define MAX_EGM_NODES 4
+struct h_node {
+ unsigned long mem_offset; /* retired page address minus egm_dev->egmphys; also used as the hash key */
+ struct hlist_node node; /* links this entry into the owning chardev's htbl */
+};
+
static dev_t dev;
static struct class *class;
static DEFINE_XARRAY(egm_chardevs);
@@ -16,6 +21,7 @@ struct chardev {
struct device device;
struct cdev cdev;
atomic_t open_count;
+ DECLARE_HASHTABLE(htbl, 0x10);
};
static struct nvgrace_egm_dev *
@@ -145,20 +151,86 @@ static void del_egm_chardev(struct chardev *egm_chardev)
put_device(&egm_chardev->device);
}
+/*
+ * Drop every retired-page entry held in @egm_chardev's hash table: each
+ * h_node is unlinked from its bucket and its memory is released.
+ */
+static void cleanup_retired_pages(struct chardev *egm_chardev)
+{
+	struct hlist_node *next;
+	struct h_node *entry;
+	unsigned long bucket;
+
+	/* The _safe iterator is required because entries are freed mid-walk. */
+	hash_for_each_safe(egm_chardev->htbl, bucket, next, entry, node) {
+		hash_del(&entry->node);
+		kvfree(entry);
+	}
+}
+
+/*
+ * nvgrace_egm_fetch_retired_pages - import the firmware's retired pages list
+ * @egm_dev: EGM device; supplies retiredpagesphys (address of the list
+ *           region) and egmphys (EGM base used to turn addresses into offsets)
+ * @egm_chardev: char device whose htbl receives one h_node per retired page
+ *
+ * The region is read as an array of u64: entry 0 holds the number of
+ * retired pages and entries 1..count hold their addresses. Only the first
+ * PAGE_SIZE bytes of the region are mapped, so a count that would reach
+ * past that mapping is rejected instead of being read out of bounds.
+ *
+ * Return: 0 on success, -ENOMEM if the mapping or an allocation fails,
+ * -EINVAL if the count does not fit in the mapped page. On any failure the
+ * entries added so far are freed again.
+ */
+static int nvgrace_egm_fetch_retired_pages(struct nvgrace_egm_dev *egm_dev,
+					   struct chardev *egm_chardev)
+{
+	/* One u64 slot is taken by the count itself. */
+	const u64 max_entries = PAGE_SIZE / sizeof(u64) - 1;
+	const u64 *entries;
+	void *memaddr;
+	u64 count, index;
+	int ret = 0;
+
+	memaddr = memremap(egm_dev->retiredpagesphys, PAGE_SIZE, MEMREMAP_WB);
+	if (!memaddr)
+		return -ENOMEM;
+
+	entries = memaddr;
+	count = entries[0];
+
+	/* Catch a bogus firmware count before it walks off the mapping. */
+	if (count > max_entries) {
+		ret = -EINVAL;
+		goto unmap;
+	}
+
+	for (index = 0; index < count; index++) {
+		struct h_node *retired_page;
+
+		retired_page = kvzalloc(sizeof(*retired_page), GFP_KERNEL);
+		if (!retired_page) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		/*
+		 * Since the EGM is linearly mapped, the offset in the
+		 * carveout is the same offset in the VM system memory.
+		 *
+		 * Calculate the offset to communicate to the usermode
+		 * apps.
+		 */
+		retired_page->mem_offset = entries[index + 1] -
+					   egm_dev->egmphys;
+		hash_add(egm_chardev->htbl, &retired_page->node,
+			 retired_page->mem_offset);
+	}
+
+unmap:
+	memunmap(memaddr);
+
+	if (ret)
+		cleanup_retired_pages(egm_chardev);
+
+	return ret;
+}
+
static int egm_driver_probe(struct auxiliary_device *aux_dev,
const struct auxiliary_device_id *id)
{
struct nvgrace_egm_dev *egm_dev =
container_of(aux_dev, struct nvgrace_egm_dev, aux_dev);
struct chardev *egm_chardev;
+ int ret;
egm_chardev = setup_egm_chardev(egm_dev);
if (!egm_chardev)
return -EINVAL;
+ hash_init(egm_chardev->htbl); /* start from an empty table before retired pages are added */
+
+ ret = nvgrace_egm_fetch_retired_pages(egm_dev, egm_chardev); /* fills htbl from the firmware list */
+ if (ret)
+ goto error_exit;
+
xa_store(&egm_chardevs, egm_dev->egmpxm, egm_chardev, GFP_KERNEL); /* NOTE(review): return value is not checked */
return 0;
+
+error_exit:
+ del_egm_chardev(egm_chardev); /* the fetch helper already freed any partial list on failure */
+ return ret;
}
static void egm_driver_remove(struct auxiliary_device *aux_dev)
@@ -166,10 +238,19 @@ static void egm_driver_remove(struct auxiliary_device *aux_dev)
struct nvgrace_egm_dev *egm_dev =
container_of(aux_dev, struct nvgrace_egm_dev, aux_dev);
struct chardev *egm_chardev = xa_erase(&egm_chardevs, egm_dev->egmpxm);
if (!egm_chardev)
return;
+ /* Free the retired-page entries before tearing down the chardev that owns the table. */
+ cleanup_retired_pages(egm_chardev);
del_egm_chardev(egm_chardev);
}
diff --git a/drivers/vfio/pci/nvgrace-gpu/egm_dev.c b/drivers/vfio/pci/nvgrace-gpu/egm_dev.c
index ca50bc1f67a0..b8e143542bce 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm_dev.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm_dev.c
@@ -18,22 +18,41 @@ int nvgrace_gpu_has_egm_property(struct pci_dev *pdev, u64 *pegmpxm)
}
int nvgrace_gpu_fetch_egm_property(struct pci_dev *pdev, u64 *pegmphys,
- u64 *pegmlength)
+ u64 *pegmlength, u64 *pretiredpagesphys)
{
int ret;
/*
- * The memory information is present in the system ACPI tables as DSD
- * properties nvidia,egm-base-pa and nvidia,egm-size.
+ * The EGM memory information is present in the system ACPI tables
+ * as DSD properties nvidia,egm-base-pa and nvidia,egm-size.
*/
ret = device_property_read_u64(&pdev->dev, "nvidia,egm-size",
pegmlength);
if (ret)
- return ret;
+ goto error_exit;
ret = device_property_read_u64(&pdev->dev, "nvidia,egm-base-pa",
pegmphys);
+ if (ret)
+ goto error_exit;
+
+ /*
+ * SBIOS puts the list of retired pages on a region. The region
+ * SPA is exposed as "nvidia,egm-retired-pages-data-base".
+ */
+ ret = device_property_read_u64(&pdev->dev,
+ "nvidia,egm-retired-pages-data-base",
+ pretiredpagesphys);
+ if (ret)
+ goto error_exit;
+
+ /* Catch firmware bug and avoid a crash */
+ if (*pretiredpagesphys == 0) {
+ dev_err(&pdev->dev, "Retired pages region is not setup\n");
+ ret = -EINVAL;
+ }
+error_exit:
return ret;
}
@@ -74,7 +93,8 @@ static void nvgrace_gpu_release_aux_device(struct device *device)
struct nvgrace_egm_dev *
nvgrace_gpu_create_aux_device(struct pci_dev *pdev, const char *name,
- u64 egmphys, u64 egmlength, u64 egmpxm)
+ u64 egmphys, u64 egmlength, u64 egmpxm,
+ u64 retiredpagesphys)
{
struct nvgrace_egm_dev *egm_dev;
int ret;
@@ -86,6 +106,8 @@ nvgrace_gpu_create_aux_device(struct pci_dev *pdev, const char *name,
egm_dev->egmpxm = egmpxm;
egm_dev->egmphys = egmphys;
egm_dev->egmlength = egmlength;
+ egm_dev->retiredpagesphys = retiredpagesphys;
+
INIT_LIST_HEAD(&egm_dev->gpus);
egm_dev->aux_dev.id = egmpxm;
diff --git a/drivers/vfio/pci/nvgrace-gpu/egm_dev.h b/drivers/vfio/pci/nvgrace-gpu/egm_dev.h
index 2e1612445898..2f329a05685d 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm_dev.h
+++ b/drivers/vfio/pci/nvgrace-gpu/egm_dev.h
@@ -16,8 +16,9 @@ void remove_gpu(struct nvgrace_egm_dev *egm_dev, struct pci_dev *pdev);
struct nvgrace_egm_dev *
nvgrace_gpu_create_aux_device(struct pci_dev *pdev, const char *name,
- u64 egmphys, u64 egmlength, u64 egmpxm);
+ u64 egmphys, u64 egmlength, u64 egmpxm,
+ u64 retiredpagesphys);
int nvgrace_gpu_fetch_egm_property(struct pci_dev *pdev, u64 *pegmphys,
- u64 *pegmlength);
+ u64 *pegmlength, u64 *pretiredpagesphys);
#endif /* EGM_DEV_H */
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index b1ccd1ac2e0a..534dc3ee6113 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -67,7 +67,7 @@ static struct list_head egm_dev_list;
static int nvgrace_gpu_create_egm_aux_device(struct pci_dev *pdev)
{
struct nvgrace_egm_dev_entry *egm_entry = NULL;
- u64 egmphys, egmlength, egmpxm;
+ u64 egmphys, egmlength, egmpxm, retiredpagesphys;
int ret = 0;
bool is_new_region = false;
@@ -80,7 +80,8 @@ static int nvgrace_gpu_create_egm_aux_device(struct pci_dev *pdev)
if (nvgrace_gpu_has_egm_property(pdev, &egmpxm))
goto exit;
- ret = nvgrace_gpu_fetch_egm_property(pdev, &egmphys, &egmlength);
+ ret = nvgrace_gpu_fetch_egm_property(pdev, &egmphys, &egmlength,
+ &retiredpagesphys);
if (ret)
goto exit;
@@ -103,7 +104,8 @@ static int nvgrace_gpu_create_egm_aux_device(struct pci_dev *pdev)
egm_entry->egm_dev =
nvgrace_gpu_create_aux_device(pdev, NVGRACE_EGM_DEV_NAME,
- egmphys, egmlength, egmpxm);
+ egmphys, egmlength, egmpxm,
+ retiredpagesphys);
if (!egm_entry->egm_dev) {
ret = -EINVAL;
goto free_egm_entry;
diff --git a/include/linux/nvgrace-egm.h b/include/linux/nvgrace-egm.h
index a66906753267..197255c2a3b7 100644
--- a/include/linux/nvgrace-egm.h
+++ b/include/linux/nvgrace-egm.h
@@ -7,6 +7,7 @@
#define NVGRACE_EGM_H
#include <linux/auxiliary_bus.h>
+#include <linux/hashtable.h>
#define NVGRACE_EGM_DEV_NAME "egm"
@@ -19,6 +20,7 @@ struct nvgrace_egm_dev {
struct auxiliary_device aux_dev;
phys_addr_t egmphys;
size_t egmlength;
+ phys_addr_t retiredpagesphys;
u64 egmpxm;
struct list_head gpus;
};
--
2.34.1
Powered by blists - more mailing lists