[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20250916-luo-pci-v2-3-c494053c3c08@kernel.org>
Date: Tue, 16 Sep 2025 00:45:11 -0700
From: Chris Li <chrisl@...nel.org>
To: Bjorn Helgaas <bhelgaas@...gle.com>,
Greg Kroah-Hartman <gregkh@...uxfoundation.org>,
"Rafael J. Wysocki" <rafael@...nel.org>, Danilo Krummrich <dakr@...nel.org>,
Len Brown <lenb@...nel.org>, Pasha Tatashin <pasha.tatashin@...een.com>
Cc: linux-kernel@...r.kernel.org, linux-pci@...r.kernel.org,
linux-acpi@...r.kernel.org, David Matlack <dmatlack@...gle.com>,
Pasha Tatashin <tatashin@...gle.com>, Jason Miu <jasonmiu@...gle.com>,
Vipin Sharma <vipinsh@...gle.com>, Saeed Mahameed <saeedm@...dia.com>,
Adithya Jayachandran <ajayachandra@...dia.com>,
Parav Pandit <parav@...dia.com>, William Tu <witu@...dia.com>,
Mike Rapoport <rppt@...nel.org>, Chris Li <chrisl@...nel.org>,
Jason Gunthorpe <jgg@...pe.ca>, Leon Romanovsky <leon@...nel.org>
Subject: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel()
callbacks to driver
After the list of preserved devices is constructed, the PCI subsystem can
now forward the liveupdate request to the driver.
The PCI subsystem saves and restores a u64 data value through the LUO
callbacks. For each device, the PCI subsystem preserves a "dev_state"
struct, which contains the path (domain + bus + devfn) and a per-device
u64 data value.
The device driver uses this u64 data area to store its own state. The
device live update callbacks look very similar to the LUO subsystem
callbacks, with the "void *arg" parameter changed to
"struct device *dev".
In the prepare callback, the PCI subsystem allocates and then preserves a
folio big enough to hold the device count plus an array holding the state
(struct pci_dev_ser) of every requested device.
The PCI subsystem then simply forwards the liveupdate callback to each
driver, with the u64 data argument referring to that device's u64 field
in the device state array.
If some device fails the prepare callback, all previous devices that have
already successfully finished the prepare callback will get the cancel
callback to clean up their saved state. This cleanup is a special case
in which only part of the list is walked rather than the full list.
In other live update callbacks, all the devices in the preserved device
list will get the callback with their own u64 data field.
Signed-off-by: Chris Li <chrisl@...nel.org>
---
drivers/pci/liveupdate.c | 203 +++++++++++++++++++++++++++++++++++++++--
include/linux/dev_liveupdate.h | 23 +++++
include/linux/device/driver.h | 6 ++
3 files changed, 223 insertions(+), 9 deletions(-)
diff --git a/drivers/pci/liveupdate.c b/drivers/pci/liveupdate.c
index e8891844b8194dabf8d1e8e2d74d9c701bd741ca..2b215c224fb78c908579b0d22be713e1dc7ca21f 100644
--- a/drivers/pci/liveupdate.c
+++ b/drivers/pci/liveupdate.c
@@ -9,11 +9,25 @@
#define dev_fmt(fmt) "PCI liveupdate: " fmt
#include <linux/types.h>
+#include <linux/kexec_handover.h>
#include <linux/liveupdate.h>
#include "pci.h"
#define PCI_SUBSYSTEM_NAME "pci"
+static LIST_HEAD(preserved_devices);
+
+struct pci_dev_ser { /* per-device record kept in the preserved state folio */
+ u32 path; /* domain + bus + slot + fn */
+ u32 flags; /* copy of dev->lu.flags, written by pci_save_device_state() */
+ u64 driver_data; /* driver data */
+};
+
+struct pci_ser { /* layout of the folio whose physical address is returned through *data */
+ u32 count; /* value returned by build_liveupdate_devices() at prepare time */
+ struct pci_dev_ser devs[]; /* one slot per device, consumed in list order */
+};
+
static void stack_push_buses(struct list_head *stack, struct list_head *buses)
{
struct pci_bus *bus;
@@ -70,42 +84,213 @@ static int build_liveupdate_devices(struct list_head *head)
return count;
}
+static void dev_cleanup_liveupdate(struct device *dev) /* take dev off whichever liveupdate list it is linked on */
+{
+ dev->lu.flags &= ~LU_DEPENDED; /* only LU_DEPENDED is cleared; other flag bits are left as they are */
+ list_del_init(&dev->lu.lu_next); /* unlink and reinitialise the node */
+}
+
static void cleanup_liveupdate_devices(struct list_head *head)
{
struct device *d, *n;
- list_for_each_entry_safe(d, n, head, lu.lu_next) {
- d->lu.flags &= ~LU_DEPENDED;
- list_del_init(&d->lu.lu_next);
+ list_for_each_entry_safe(d, n, head, lu.lu_next) /* _safe variant: each entry is unlinked while iterating */
+ dev_cleanup_liveupdate(d); /* same two steps the removed lines did inline */
+}
+
+static void cleanup_liveupdate_state(struct pci_ser *pci_state) /* undo the kho_preserve_folio() and folio_alloc() done in prepare */
+{
+ struct folio *folio = virt_to_folio(pci_state); /* pci_state was obtained from folio_address() in prepare */
+
+ kho_unpreserve_folio(folio);
+ folio_put(folio); /* matches the folio_alloc() in __pci_liveupdate_prepare() */
+}
+
+static void pci_call_cancel(struct pci_ser *pci_state) /* call cancel() for every device on preserved_devices and empty that list */
+{
+ struct pci_dev_ser *si = pci_state->devs; /* advances through devs[] in step with the list walk */
+ struct device *dev, *next;
+
+ list_for_each_entry_safe(dev, next, &preserved_devices, lu.lu_next) {
+ struct pci_dev_ser *s = si++; /* i-th list entry is paired with devs[i] */
+
+ if (!dev->driver)
+ panic("PCI liveupdate cancel: %s has no driver", dev_name(dev));
+ if (!dev->driver->lu)
+ panic("PCI liveupdate cancel: %s driver %s does not support liveupdate",
+ dev_name(dev), dev->driver->name ? : "(null name)");
+ if (dev->driver->lu->cancel) /* a driver may leave cancel unset */
+ dev->driver->lu->cancel(dev, s->driver_data); /* value is passed, not a pointer as in prepare/freeze */
+ dev_cleanup_liveupdate(dev); /* unlinks dev, which is why the _safe iterator is used */
}
}
-static int pci_liveupdate_prepare(void *arg, u64 *data)
+static int pci_get_device_path(struct pci_dev *pdev) /* pack the domain number (upper bits) and pci_dev_id() (low 16 bits) into one value */
+{
+ return (pci_domain_nr(pdev->bus) << 16) | pci_dev_id(pdev); /* NOTE(review): returned as int but stored in a u32 path; confirm the shift cannot overflow for large domain numbers */
+}
+
+static int pci_save_device_state(struct device *dev, struct pci_dev_ser *s) /* fill the path and flags fields of s; driver_data is not touched here */
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+
+ s->path = pci_get_device_path(pdev);
+ s->flags = dev->lu.flags;
+ return 0; /* always succeeds as written; the caller still checks the result */
+}
+
+static int pci_call_prepare(struct pci_ser *pci_state,
+ struct list_head *devices) /* run prepare() for each device on @devices; on the first failure cancel those already prepared */
+{
+ struct pci_dev_ser *pdev_state_current = pci_state->devs; /* next unused slot in devs[] */
+ struct device *dev, *next;
+ int ret;
+ char *reason;
+
+ list_for_each_entry_safe(dev, next, devices, lu.lu_next) {
+ struct pci_dev_ser *s = pdev_state_current++; /* slot for this device */
+
+ if (!dev->driver) {
+ reason = "no driver";
+ ret = -ENOENT;
+ goto cancel;
+ }
+ if (!dev->driver->lu) {
+ reason = "driver does not support liveupdate";
+ ret = -EPERM;
+ goto cancel;
+ }
+ ret = pci_save_device_state(dev, s);
+ if (ret) {
+ reason = "save device state failed";
+ goto cancel;
+ }
+ if (dev->driver->lu->prepare) { /* a driver may leave prepare unset */
+ ret = dev->driver->lu->prepare(dev, &s->driver_data); /* driver may store its u64 in the slot */
+ if (ret) {
+ reason = "prepare() failed";
+ goto cancel;
+ }
+ }
+ list_move_tail(&dev->lu.lu_next, &preserved_devices); /* tail insert keeps list order equal to devs[] order */
+ }
+ return 0;
+
+cancel:
+ dev_err(dev, "luo prepare failed %d (%s)\n", ret, reason);
+ pci_call_cancel(pci_state); /* walks preserved_devices only, i.e. the devices that already succeeded */
+ return ret; /* the failing device and any later ones are still on @devices */
+}
+
+static int __pci_liveupdate_prepare(void *arg, u64 *data) /* prepare work proper; pci_liveupdate_prepare() calls this with the PCI locks held */
{
LIST_HEAD(requested_devices);
+ struct pci_ser *pci_state;
+ int ret;
+ int count = build_liveupdate_devices(&requested_devices); /* NOTE(review): used below as if non-negative; confirm it cannot be an error code */
+ int size = sizeof(*pci_state) + sizeof(pci_state->devs[0]) * count; /* header plus one pci_dev_ser per device */
+ int order = get_order(size);
+ struct folio *folio;
- pr_info("prepare data[%llx]\n", *data);
+ folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, order); /* zeroed, so untouched driver_data slots read as 0 */
+ if (!folio) {
+ ret = -ENOMEM;
+ goto cleanup_device;
+ }
- pci_lock_rescan_remove();
- down_write(&pci_bus_sem);
+ pci_state = folio_address(folio);
+ pci_state->count = count;
+
+ ret = kho_preserve_folio(folio);
+ if (ret) {
+ pr_err("liveupdate_preserve_folio failed\n");
+ goto release_folio;
+ }
+
+ ret = pci_call_prepare(pci_state, &requested_devices); /* on failure it has already cancelled the devices it prepared */
+ if (ret)
+ goto unpreserve;
- build_liveupdate_devices(&requested_devices);
+ *data = __pa(pci_state); /* physical address; freeze and cancel map it back with phys_to_virt() */
+ pr_info("prepare data[%llx]\n", *data);
+ return 0;
+
+unpreserve:
+ kho_unpreserve_folio(folio);
+release_folio:
+ folio_put(folio);
+cleanup_device:
cleanup_liveupdate_devices(&requested_devices);
+ return ret;
+}
+static int pci_liveupdate_prepare(void *arg, u64 *data) /* locking wrapper around __pci_liveupdate_prepare() */
+{
+ int ret;
+
+ pci_lock_rescan_remove();
+ down_write(&pci_bus_sem);
+ ret = __pci_liveupdate_prepare(arg, data); /* runs with both locks held */
up_write(&pci_bus_sem);
pci_unlock_rescan_remove();
+ return ret;
+}
+
+static int pci_call_freeze(struct pci_ser *pci_state, struct list_head *devlist) /* call freeze() for each device on @devlist; cancel everything on the first failure */
+{
+ struct pci_dev_ser *n = pci_state->devs; /* advances through devs[] in step with the list walk */
+ struct device *dev;
+ int ret = 0;
+
+ list_for_each_entry(dev, devlist, lu.lu_next) {
+ struct pci_dev_ser *s = n++;
+
+ if (!dev->driver) {
+ if (!dev->parent) /* a driverless device without a parent is skipped rather than treated as fatal */
+ continue;
+ panic("PCI liveupdate freeze: %s has no driver", dev_name(dev));
+ }
+ if (!dev->driver->lu->freeze) /* NOTE(review): lu is dereferenced without a NULL check here, unlike prepare/cancel; confirm it cannot be NULL at this point */
+ continue;
+ ret = dev->driver->lu->freeze(dev, &s->driver_data);
+ if (ret) {
+ dev_err(dev, "luo freeze failed %d\n", ret);
+ pci_call_cancel(pci_state); /* cancels every preserved device; NOTE(review): the state folio is not released on this path, confirm who frees it */
+ return ret;
+ }
+ }
return 0;
}
static int pci_liveupdate_freeze(void *arg, u64 *data)
{
+ struct pci_ser *pci_state = phys_to_virt(*data); /* *data holds the physical address stored by prepare via __pa() */
+ int ret;
+
pr_info("freeze data[%llx]\n", *data);
- return 0;
+ pci_lock_rescan_remove();
+ down_write(&pci_bus_sem);
+
+ ret = pci_call_freeze(pci_state, &preserved_devices); /* runs with both locks held */
+
+ up_write(&pci_bus_sem);
+ pci_unlock_rescan_remove();
+ return ret;
}
static void pci_liveupdate_cancel(void *arg, u64 data)
{
+ struct pci_ser *pci_state = phys_to_virt(data); /* data holds the physical address stored by prepare via __pa() */
+
pr_info("cancel data[%llx]\n", data);
+ pci_lock_rescan_remove();
+ down_write(&pci_bus_sem);
+
+ pci_call_cancel(pci_state); /* per-device cancel callbacks run first */
+ cleanup_liveupdate_state(pci_state); /* then the state folio is unpreserved and released */
+
+ up_write(&pci_bus_sem);
+ pci_unlock_rescan_remove();
}
static void pci_liveupdate_finish(void *arg, u64 data)
diff --git a/include/linux/dev_liveupdate.h b/include/linux/dev_liveupdate.h
index 72297cba08a999e89f7bc0997dabdbe14e0aa12c..80a723c7701ac4ddc2ddd03d0ffc9cc5a62a6083 100644
--- a/include/linux/dev_liveupdate.h
+++ b/include/linux/dev_liveupdate.h
@@ -20,6 +20,8 @@ enum liveupdate_flag {
#define LU_REQUESTED (LU_BUSMASTER)
#define LU_DEPENDED (LU_BUSMASTER_BRIDGE)
+struct device;
+
/**
* struct dev_liveupdate - Device state for live update operations
* @lu_next: List head for linking the device into live update
@@ -40,5 +42,26 @@ struct dev_liveupdate {
bool visited:1;
};
+/**
+ * struct dev_liveupdate_ops - Live Update callback functions
+ * @prepare: Prepare device for the upcoming state transition. Driver and
+ * buses should save the necessary device state.
+ * @freeze: A final notification before the system jumps to the new kernel.
+ * Called from reboot() syscall.
+ * @cancel: Cancel the live update process. Driver should clean
+ * up any saved state if necessary.
+ * @finish: The system has completed a transition. Drivers and buses should
+ * have already restored the previously saved device state.
+ * Clean-up any saved state or reset unreclaimed device.
+ *
+ * This structure is used by drivers and buses to hold the callback from LUO.
+ */
+struct dev_liveupdate_ops {
+ int (*prepare)(struct device *dev, u64 *data); /* @data: pointer to the device's u64 slot; non-zero return is treated as failure by the PCI caller */
+ int (*freeze)(struct device *dev, u64 *data); /* @data: pointer to the same per-device u64 slot */
+ void (*cancel)(struct device *dev, u64 data); /* @data: the slot's value, passed by value */
+ void (*finish)(struct device *dev, u64 data); /* @data: passed by value */
+};
+
#endif /* CONFIG_LIVEUPDATE */
#endif /* _LINUX_DEV_LIVEUPDATE_H */
diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h
index cd8e0f0a634be9ea63ff22e89d66ada3b1a9eaf2..b2ba469cc3065a412f02230c62e811af19c4d2c6 100644
--- a/include/linux/device/driver.h
+++ b/include/linux/device/driver.h
@@ -19,6 +19,7 @@
#include <linux/pm.h>
#include <linux/device/bus.h>
#include <linux/module.h>
+#include <linux/dev_liveupdate.h>
/**
* enum probe_type - device driver probe type to try
@@ -80,6 +81,8 @@ enum probe_type {
* it is bound to the driver.
* @pm: Power management operations of the device which matched
* this driver.
+ * @lu: Live update callbacks, notify device of the live
+ * update state, and allow preserve device across reboot.
* @coredump: Called when sysfs entry is written to. The device driver
* is expected to call the dev_coredump API resulting in a
* uevent.
@@ -116,6 +119,9 @@ struct device_driver {
const struct attribute_group **dev_groups;
const struct dev_pm_ops *pm;
+#ifdef CONFIG_LIVEUPDATE
+ const struct dev_liveupdate_ops *lu;
+#endif
void (*coredump) (struct device *dev);
struct driver_private *p;
--
2.51.0.384.g4c02a37b29-goog
Powered by blists - more mailing lists