lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250904040828.319452-12-ankita@nvidia.com>
Date: Thu, 4 Sep 2025 04:08:25 +0000
From: <ankita@...dia.com>
To: <ankita@...dia.com>, <jgg@...dia.com>, <alex.williamson@...hat.com>,
	<yishaih@...dia.com>, <skolothumtho@...dia.com>, <kevin.tian@...el.com>,
	<yi.l.liu@...el.com>, <zhiw@...dia.com>
CC: <aniketa@...dia.com>, <cjia@...dia.com>, <kwankhede@...dia.com>,
	<targupta@...dia.com>, <vsethi@...dia.com>, <acurrid@...dia.com>,
	<apopple@...dia.com>, <jhubbard@...dia.com>, <danw@...dia.com>,
	<anuaggarwal@...dia.com>, <mochs@...dia.com>, <kjaju@...dia.com>,
	<dnigam@...dia.com>, <kvm@...r.kernel.org>, <linux-kernel@...r.kernel.org>
Subject: [RFC 11/14] vfio/nvgrace-egm: Fetch EGM region retired pages list

From: Ankit Agrawal <ankita@...dia.com>

It is possible for some system memory pages in the EGM region to
have uncorrectable ECC errors. A list of pages known to have such
errors (referred to as retired pages) is maintained by the Host
UEFI. The Host UEFI populates this list in a reserved region and
communicates the SPA of this region through an ACPI DSDT
property.

The nvgrace-egm module is responsible for storing the list of retired
page offsets so that it can be made available to usermode processes.
The module:
1. Gets the reserved memory region SPA and maps it to fetch
the list of bad pages.
2. Calculates the retired page offsets in the EGM and stores them.

Signed-off-by: Ankit Agrawal <ankita@...dia.com>
---
 drivers/vfio/pci/nvgrace-gpu/egm.c     | 81 ++++++++++++++++++++++++++
 drivers/vfio/pci/nvgrace-gpu/egm_dev.c | 32 ++++++++--
 drivers/vfio/pci/nvgrace-gpu/egm_dev.h |  5 +-
 drivers/vfio/pci/nvgrace-gpu/main.c    |  8 ++-
 include/linux/nvgrace-egm.h            |  2 +
 5 files changed, 118 insertions(+), 10 deletions(-)

diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
index bf1241ed1d60..7a026b4d98f7 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
@@ -8,6 +8,11 @@
 
 #define MAX_EGM_NODES 4
 
+struct h_node {
+	unsigned long mem_offset;
+	struct hlist_node node;
+};
+
 static dev_t dev;
 static struct class *class;
 static DEFINE_XARRAY(egm_chardevs);
@@ -16,6 +21,7 @@ struct chardev {
 	struct device device;
 	struct cdev cdev;
 	atomic_t open_count;
+	DECLARE_HASHTABLE(htbl, 0x10);
 };
 
 static struct nvgrace_egm_dev *
@@ -145,20 +151,86 @@ static void del_egm_chardev(struct chardev *egm_chardev)
 	put_device(&egm_chardev->device);
 }
 
+static void cleanup_retired_pages(struct chardev *egm_chardev)
+{
+	struct h_node *cur_page;
+	unsigned long bkt;
+	struct hlist_node *temp_node;
+
+	hash_for_each_safe(egm_chardev->htbl, bkt, temp_node, cur_page, node) {
+		hash_del(&cur_page->node);
+		kvfree(cur_page);
+	}
+}
+
+static int nvgrace_egm_fetch_retired_pages(struct nvgrace_egm_dev *egm_dev,
+					   struct chardev *egm_chardev)
+{
+	u64 count;
+	void *memaddr;
+	int index, ret = 0;
+
+	memaddr = memremap(egm_dev->retiredpagesphys, PAGE_SIZE, MEMREMAP_WB);
+	if (!memaddr)
+		return -ENOMEM;
+
+	count = *(u64 *)memaddr;
+
+	for (index = 0; index < count; index++) {
+		struct h_node *retired_page;
+
+		/*
+		 * Since the EGM is linearly mapped, the offset in the
+		 * carveout is the same offset in the VM system memory.
+		 *
+		 * Calculate the offset to communicate to the usermode
+		 * apps.
+		 */
+		retired_page = kvzalloc(sizeof(*retired_page), GFP_KERNEL);
+		if (!retired_page) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		retired_page->mem_offset = *((u64 *)memaddr + index + 1) -
+					   egm_dev->egmphys;
+		hash_add(egm_chardev->htbl, &retired_page->node,
+			 retired_page->mem_offset);
+	}
+
+	memunmap(memaddr);
+
+	if (ret)
+		cleanup_retired_pages(egm_chardev);
+
+	return ret;
+}
+
 static int egm_driver_probe(struct auxiliary_device *aux_dev,
 			    const struct auxiliary_device_id *id)
 {
 	struct nvgrace_egm_dev *egm_dev =
 		container_of(aux_dev, struct nvgrace_egm_dev, aux_dev);
 	struct chardev *egm_chardev;
+	int ret;
 
 	egm_chardev = setup_egm_chardev(egm_dev);
 	if (!egm_chardev)
 		return -EINVAL;
 
+	hash_init(egm_chardev->htbl);
+
+	ret = nvgrace_egm_fetch_retired_pages(egm_dev, egm_chardev);
+	if (ret)
+		goto error_exit;
+
 	xa_store(&egm_chardevs, egm_dev->egmpxm, egm_chardev, GFP_KERNEL);
 
 	return 0;
+
+error_exit:
+	del_egm_chardev(egm_chardev);
+	return ret;
 }
 
 static void egm_driver_remove(struct auxiliary_device *aux_dev)
@@ -166,10 +238,19 @@ static void egm_driver_remove(struct auxiliary_device *aux_dev)
 	struct nvgrace_egm_dev *egm_dev =
 		container_of(aux_dev, struct nvgrace_egm_dev, aux_dev);
 	struct chardev *egm_chardev = xa_erase(&egm_chardevs, egm_dev->egmpxm);
+	struct h_node *cur_page;
+	unsigned long bkt;
+	struct hlist_node *temp_node;
 
 	if (!egm_chardev)
 		return;
 
+	hash_for_each_safe(egm_chardev->htbl, bkt, temp_node, cur_page, node) {
+		hash_del(&cur_page->node);
+		kvfree(cur_page);
+	}
+
+	cleanup_retired_pages(egm_chardev);
 	del_egm_chardev(egm_chardev);
 }
 
diff --git a/drivers/vfio/pci/nvgrace-gpu/egm_dev.c b/drivers/vfio/pci/nvgrace-gpu/egm_dev.c
index ca50bc1f67a0..b8e143542bce 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm_dev.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm_dev.c
@@ -18,22 +18,41 @@ int nvgrace_gpu_has_egm_property(struct pci_dev *pdev, u64 *pegmpxm)
 }
 
 int nvgrace_gpu_fetch_egm_property(struct pci_dev *pdev, u64 *pegmphys,
-				   u64 *pegmlength)
+				   u64 *pegmlength, u64 *pretiredpagesphys)
 {
 	int ret;
 
 	/*
-	 * The memory information is present in the system ACPI tables as DSD
-	 * properties nvidia,egm-base-pa and nvidia,egm-size.
+	 * The EGM memory information is present in the system ACPI tables
+	 * as DSD properties nvidia,egm-base-pa and nvidia,egm-size.
 	 */
 	ret = device_property_read_u64(&pdev->dev, "nvidia,egm-size",
 				       pegmlength);
 	if (ret)
-		return ret;
+		goto error_exit;
 
 	ret = device_property_read_u64(&pdev->dev, "nvidia,egm-base-pa",
 				       pegmphys);
+	if (ret)
+		goto error_exit;
+
+	/*
+	 * SBIOS puts the list of retired pages on a region. The region
+	 * SPA is exposed as "nvidia,egm-retired-pages-data-base".
+	 */
+	ret = device_property_read_u64(&pdev->dev,
+				       "nvidia,egm-retired-pages-data-base",
+				       pretiredpagesphys);
+	if (ret)
+		goto error_exit;
+
+	/* Catch firmware bug and avoid a crash */
+	if (*pretiredpagesphys == 0) {
+		dev_err(&pdev->dev, "Retired pages region is not setup\n");
+		ret = -EINVAL;
+	}
 
+error_exit:
 	return ret;
 }
 
@@ -74,7 +93,8 @@ static void nvgrace_gpu_release_aux_device(struct device *device)
 
 struct nvgrace_egm_dev *
 nvgrace_gpu_create_aux_device(struct pci_dev *pdev, const char *name,
-			      u64 egmphys, u64 egmlength, u64 egmpxm)
+			      u64 egmphys, u64 egmlength, u64 egmpxm,
+			      u64 retiredpagesphys)
 {
 	struct nvgrace_egm_dev *egm_dev;
 	int ret;
@@ -86,6 +106,8 @@ nvgrace_gpu_create_aux_device(struct pci_dev *pdev, const char *name,
 	egm_dev->egmpxm = egmpxm;
 	egm_dev->egmphys = egmphys;
 	egm_dev->egmlength = egmlength;
+	egm_dev->retiredpagesphys = retiredpagesphys;
+
 	INIT_LIST_HEAD(&egm_dev->gpus);
 
 	egm_dev->aux_dev.id = egmpxm;
diff --git a/drivers/vfio/pci/nvgrace-gpu/egm_dev.h b/drivers/vfio/pci/nvgrace-gpu/egm_dev.h
index 2e1612445898..2f329a05685d 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm_dev.h
+++ b/drivers/vfio/pci/nvgrace-gpu/egm_dev.h
@@ -16,8 +16,9 @@ void remove_gpu(struct nvgrace_egm_dev *egm_dev, struct pci_dev *pdev);
 
 struct nvgrace_egm_dev *
 nvgrace_gpu_create_aux_device(struct pci_dev *pdev, const char *name,
-			      u64 egmphys, u64 egmlength, u64 egmpxm);
+			      u64 egmphys, u64 egmlength, u64 egmpxm,
+			      u64 retiredpagesphys);
 
 int nvgrace_gpu_fetch_egm_property(struct pci_dev *pdev, u64 *pegmphys,
-				   u64 *pegmlength);
+				   u64 *pegmlength, u64 *pretiredpagesphys);
 #endif /* EGM_DEV_H */
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index b1ccd1ac2e0a..534dc3ee6113 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -67,7 +67,7 @@ static struct list_head egm_dev_list;
 static int nvgrace_gpu_create_egm_aux_device(struct pci_dev *pdev)
 {
 	struct nvgrace_egm_dev_entry *egm_entry = NULL;
-	u64 egmphys, egmlength, egmpxm;
+	u64 egmphys, egmlength, egmpxm, retiredpagesphys;
 	int ret = 0;
 	bool is_new_region = false;
 
@@ -80,7 +80,8 @@ static int nvgrace_gpu_create_egm_aux_device(struct pci_dev *pdev)
 	if (nvgrace_gpu_has_egm_property(pdev, &egmpxm))
 		goto exit;
 
-	ret = nvgrace_gpu_fetch_egm_property(pdev, &egmphys, &egmlength);
+	ret = nvgrace_gpu_fetch_egm_property(pdev, &egmphys, &egmlength,
+					     &retiredpagesphys);
 	if (ret)
 		goto exit;
 
@@ -103,7 +104,8 @@ static int nvgrace_gpu_create_egm_aux_device(struct pci_dev *pdev)
 
 	egm_entry->egm_dev =
 		nvgrace_gpu_create_aux_device(pdev, NVGRACE_EGM_DEV_NAME,
-					      egmphys, egmlength, egmpxm);
+					      egmphys, egmlength, egmpxm,
+					      retiredpagesphys);
 	if (!egm_entry->egm_dev) {
 		ret = -EINVAL;
 		goto free_egm_entry;
diff --git a/include/linux/nvgrace-egm.h b/include/linux/nvgrace-egm.h
index a66906753267..197255c2a3b7 100644
--- a/include/linux/nvgrace-egm.h
+++ b/include/linux/nvgrace-egm.h
@@ -7,6 +7,7 @@
 #define NVGRACE_EGM_H
 
 #include <linux/auxiliary_bus.h>
+#include <linux/hashtable.h>
 
 #define NVGRACE_EGM_DEV_NAME "egm"
 
@@ -19,6 +20,7 @@ struct nvgrace_egm_dev {
 	struct auxiliary_device aux_dev;
 	phys_addr_t egmphys;
 	size_t egmlength;
+	phys_addr_t retiredpagesphys;
 	u64 egmpxm;
 	struct list_head gpus;
 };
-- 
2.34.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ