[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251006120944.7880-4-spasswolf@web.de>
Date: Mon, 6 Oct 2025 14:09:42 +0200
From: Bert Karwatzki <spasswolf@....de>
To: linux-kernel@...r.kernel.org
Cc: Bert Karwatzki <spasswolf@....de>,
linux-next@...r.kernel.org,
linux-stable@...r.kernel.org,
regressions@...ts.linux.dev,
linux-pci@...r.kernel.org,
linux-acpi@...r.kernel.org,
Mario Limonciello <superm1@...nel.org>,
Christian König <christian.koenig@....com>,
"Rafael J . Wysocki" <rafael.j.wysocki@...el.com>
Subject: [REGRESSION 03/04] Crash during resume of pcie bridge
In order to get a working crash I removed some of the monitoring again:
diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index bc365c0dbe2f..a984ccd4a2a0 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -514,60 +514,46 @@ static void acpi_bus_notify(acpi_handle handle, u32 type, void *data)
switch (type) {
case ACPI_NOTIFY_BUS_CHECK:
- printk(KERN_INFO "%s %d: ACPI_NOTIFY_BUS_CHECK\n", __func__, __LINE__);
acpi_handle_debug(handle, "ACPI_NOTIFY_BUS_CHECK event\n");
break;
case ACPI_NOTIFY_DEVICE_CHECK:
- printk(KERN_INFO "%s %d: ACPI_NOTIFY_DEVICE_CHECK\n", __func__, __LINE__);
acpi_handle_debug(handle, "ACPI_NOTIFY_DEVICE_CHECK event\n");
break;
case ACPI_NOTIFY_DEVICE_WAKE:
- printk(KERN_INFO "%s %d: ACPI_NOTIFY_DEVICE_WAKE\n", __func__, __LINE__);
acpi_handle_debug(handle, "ACPI_NOTIFY_DEVICE_WAKE event\n");
return;
case ACPI_NOTIFY_EJECT_REQUEST:
- printk(KERN_INFO "%s %d: ACPI_NOTIFY_EJECT_REQUEST\n", __func__, __LINE__);
acpi_handle_debug(handle, "ACPI_NOTIFY_EJECT_REQUEST event\n");
break;
case ACPI_NOTIFY_DEVICE_CHECK_LIGHT:
- printk(KERN_INFO "%s %d: ACPI_NOTIFY_DEVICE_CHECK_LIGHT\n", __func__, __LINE__);
acpi_handle_debug(handle, "ACPI_NOTIFY_DEVICE_CHECK_LIGHT event\n");
/* TBD: Exactly what does 'light' mean? */
return;
case ACPI_NOTIFY_FREQUENCY_MISMATCH:
- printk(KERN_INFO "%s %d: ACPI_NOTIFY_FREQUENCY_MISMATCH\n", __func__, __LINE__);
acpi_handle_err(handle, "Device cannot be configured due "
"to a frequency mismatch\n");
return;
case ACPI_NOTIFY_BUS_MODE_MISMATCH:
- printk(KERN_INFO "%s %d: ACPI_NOTIFY_BUS_MODE_MISMATCH\n", __func__, __LINE__);
acpi_handle_err(handle, "Device cannot be configured due "
"to a bus mode mismatch\n");
return;
case ACPI_NOTIFY_POWER_FAULT:
- printk(KERN_INFO "%s %d: ACPI_NOTIFY_POWER_FAULT\n", __func__, __LINE__);
acpi_handle_err(handle, "Device has suffered a power fault\n");
return;
default:
- printk(KERN_INFO "%s %d: acpi unknown event type\n", __func__, __LINE__);
acpi_handle_debug(handle, "Unknown event type 0x%x\n", type);
return;
}
adev = acpi_get_acpi_dev(handle);
- if (adev)
- dev_info(&adev->dev, "%s %d\n", __func__, __LINE__);
- else
- printk(KERN_INFO "%s %d: adev = NULL\n", __func__, __LINE__);
-
if (adev && ACPI_SUCCESS(acpi_hotplug_schedule(adev, type)))
return;
diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c
index 9a7dc432b50d..4e0583274b8f 100644
--- a/drivers/acpi/device_pm.c
+++ b/drivers/acpi/device_pm.c
@@ -539,7 +539,6 @@ static void acpi_pm_notify_handler(acpi_handle handle, u32 val, void *not_used)
if (!adev)
return;
- dev_info(&adev->dev, "%s %d\n", __func__, __LINE__);
mutex_lock(&acpi_pm_notifier_lock);
if (adev->wakeup.flags.notifier_present) {
diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index 0f6a16856119..5ff343096ece 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -1167,7 +1167,6 @@ void acpi_os_wait_events_complete(void)
* Make sure the GPE handler or the fixed event handler is not used
* on another CPU after removal.
*/
- printk(KERN_INFO "%s %d\n", __func__, __LINE__);
if (acpi_sci_irq_valid())
synchronize_hardirq(acpi_sci_irq);
flush_workqueue(kacpid_wq);
@@ -1185,7 +1184,6 @@ static void acpi_hotplug_work_fn(struct work_struct *work)
{
struct acpi_hp_work *hpw = container_of(work, struct acpi_hp_work, work);
- printk(KERN_INFO "%s %d\n", __func__, __LINE__);
acpi_os_wait_events_complete();
acpi_device_hotplug(hpw->adev, hpw->src);
kfree(hpw);
@@ -1194,7 +1192,6 @@ static void acpi_hotplug_work_fn(struct work_struct *work)
acpi_status acpi_hotplug_schedule(struct acpi_device *adev, u32 src)
{
struct acpi_hp_work *hpw;
- dev_info(&adev->dev, "%s %d\n", __func__, __LINE__);
acpi_handle_debug(adev->handle,
"Scheduling hotplug event %u for deferred handling\n",
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index d53be7e0388d..065abe56f440 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -251,7 +251,6 @@ static int acpi_scan_check_and_detach(struct acpi_device *adev, void *p)
{
struct acpi_scan_handler *handler = adev->handler;
uintptr_t flags = (uintptr_t)p;
- dev_info(&adev->dev, "%s %d\n", __func__, __LINE__);
acpi_dev_for_each_child_reverse(adev, acpi_scan_check_and_detach, p);
@@ -315,7 +314,6 @@ static void acpi_scan_check_subtree(struct acpi_device *adev)
{
uintptr_t flags = ACPI_SCAN_CHECK_FLAG_STATUS;
- dev_info(&adev->dev, "%s %d\n", __func__, __LINE__);
acpi_scan_check_and_detach(adev, (void *)flags);
}
@@ -371,7 +369,6 @@ static int acpi_scan_rescan_bus(struct acpi_device *adev)
{
struct acpi_scan_handler *handler = adev->handler;
int ret;
- dev_info(&adev->dev, "%s %d\n", __func__, __LINE__);
if (handler && handler->hotplug.scan_dependent)
ret = handler->hotplug.scan_dependent(adev);
@@ -388,7 +385,6 @@ static int acpi_scan_device_check(struct acpi_device *adev)
{
struct acpi_device *parent;
- dev_info(&adev->dev, "%s %d\n", __func__, __LINE__);
acpi_scan_check_subtree(adev);
if (!acpi_device_is_present(adev))
@@ -416,24 +412,19 @@ static int acpi_scan_device_check(struct acpi_device *adev)
static int acpi_scan_bus_check(struct acpi_device *adev)
{
acpi_scan_check_subtree(adev);
- dev_info(&adev->dev, "%s %d\n", __func__, __LINE__);
return acpi_scan_rescan_bus(adev);
}
static int acpi_generic_hotplug_event(struct acpi_device *adev, u32 type)
{
- dev_info(&adev->dev, "%s %d\n", __func__, __LINE__);
switch (type) {
case ACPI_NOTIFY_BUS_CHECK:
- dev_info(&adev->dev, "%s %d\n", __func__, __LINE__);
return acpi_scan_bus_check(adev);
case ACPI_NOTIFY_DEVICE_CHECK:
- dev_info(&adev->dev, "%s %d\n", __func__, __LINE__);
return acpi_scan_device_check(adev);
case ACPI_NOTIFY_EJECT_REQUEST:
case ACPI_OST_EC_OSPM_EJECT:
- dev_info(&adev->dev, "%s %d\n", __func__, __LINE__);
if (adev->handler && !adev->handler->hotplug.enabled) {
dev_info(&adev->dev, "Eject disabled\n");
return -EPERM;
@@ -450,7 +441,6 @@ void acpi_device_hotplug(struct acpi_device *adev, u32 src)
u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE;
int error = -ENODEV;
- dev_info(&adev->dev, "%s %d\n", __func__, __LINE__);
lock_device_hotplug();
mutex_lock(&acpi_scan_lock);
@@ -476,10 +466,9 @@ void acpi_device_hotplug(struct acpi_device *adev, u32 src)
* There may be additional notify handlers for device objects
* without the .event() callback, so ignore them here.
*/
- if (notify) {
- dev_info(&adev->dev, "%s %d: calling notify = %px\n", __func__, __LINE__, (void *) notify);
+ if (notify)
error = notify(adev, src);
- } else
+ else
goto out;
}
switch (error) {
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 895898c3cd56..27cce7f1b1d3 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -142,6 +142,8 @@ EXPORT_SYMBOL_GPL(pm_runtime_suspended_time);
*/
static void pm_runtime_deactivate_timer(struct device *dev)
{
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
if (dev->power.timer_expires > 0) {
hrtimer_try_to_cancel(&dev->power.suspend_timer);
dev->power.timer_expires = 0;
@@ -787,8 +789,6 @@ static int rpm_resume(struct device *dev, int rpmflags)
struct device *parent = NULL;
int retval = 0;
- if (!strcmp(dev_name(dev), "0000:00:01.1"))
- dev_info(dev, "%s %d\n", __func__, __LINE__);
trace_rpm_resume(dev, rpmflags);
repeat:
@@ -815,7 +815,7 @@ static int rpm_resume(struct device *dev, int rpmflags)
* future.
*/
if (!strcmp(dev_name(dev), "0000:00:01.1"))
- dev_info(dev, "%s %d\n", __func__, __LINE__);
+ dev_info(dev, "%s %d dev = %px\n", __func__, __LINE__, dev);
dev->power.request = RPM_REQ_NONE;
if (!dev->power.timer_autosuspends)
pm_runtime_deactivate_timer(dev);
@@ -1231,22 +1231,16 @@ int __pm_runtime_resume(struct device *dev, int rpmflags)
{
unsigned long flags;
int retval;
- if (!strcmp(dev_name(dev), "0000:00:01.1"))
- dev_info(dev, "%s %d\n", __func__, __LINE__);
might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe &&
dev->power.runtime_status != RPM_ACTIVE);
- if (!strcmp(dev_name(dev), "0000:00:01.1"))
- dev_info(dev, "%s %d\n", __func__, __LINE__);
if (rpmflags & RPM_GET_PUT)
atomic_inc(&dev->power.usage_count);
- if (!strcmp(dev_name(dev), "0000:00:01.1"))
- dev_info(dev, "%s %d\n", __func__, __LINE__);
spin_lock_irqsave(&dev->power.lock, flags);
if (!strcmp(dev_name(dev), "0000:00:01.1"))
- dev_info(dev, "%s %d\n", __func__, __LINE__);
+ dev_info(dev, "%s %d dev = %px\n", __func__, __LINE__, dev);
retval = rpm_resume(dev, rpmflags);
spin_unlock_irqrestore(&dev->power.lock, flags);
diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
index e56ab308da20..e21255b97251 100644
--- a/drivers/pci/hotplug/acpiphp_glue.c
+++ b/drivers/pci/hotplug/acpiphp_glue.c
@@ -484,7 +484,6 @@ static void enable_slot(struct acpiphp_slot *slot, bool bridge)
struct pci_dev *dev;
struct pci_bus *bus = slot->bus;
struct acpiphp_func *func;
- printk(KERN_INFO "%s %d\n", __func__, __LINE__);
if (bridge && bus->self && hotplug_is_native(bus->self)) {
/*
@@ -495,14 +494,11 @@ static void enable_slot(struct acpiphp_slot *slot, bool bridge)
* as a Thunderbolt host controller.
*/
for_each_pci_bridge(dev, bus) {
- dev_info(&dev->dev, "%s %d\n", __func__, __LINE__);
if (PCI_SLOT(dev->devfn) == slot->device) {
- dev_info(&dev->dev, "%s %d\n", __func__, __LINE__);
acpiphp_native_scan_bridge(dev);
}
}
} else {
- printk(KERN_INFO "%s %d\n", __func__, __LINE__);
LIST_HEAD(add_list);
int max, pass;
@@ -510,15 +506,12 @@ static void enable_slot(struct acpiphp_slot *slot, bool bridge)
max = acpiphp_max_busnr(bus);
for (pass = 0; pass < 2; pass++) {
for_each_pci_bridge(dev, bus) {
- dev_info(&dev->dev, "%s %d\n", __func__, __LINE__);
if (PCI_SLOT(dev->devfn) != slot->device) {
- printk(KERN_INFO "%s %d\n", __func__, __LINE__);
continue;
}
max = pci_scan_bridge(bus, dev, max, pass);
if (pass && dev->subordinate) {
- dev_info(&dev->dev, "%s %d\n", __func__, __LINE__);
check_hotplug_bridge(slot, dev);
pcibios_resource_survey_bus(dev->subordinate);
__pci_bus_size_bridges(dev->subordinate,
@@ -535,7 +528,6 @@ static void enable_slot(struct acpiphp_slot *slot, bool bridge)
list_for_each_entry(dev, &bus->devices, bus_list) {
/* Assume that newly added devices are powered on already. */
- dev_info(&dev->dev, "%s %d\n", __func__, __LINE__);
if (!pci_dev_is_added(dev))
dev->current_state = PCI_D0;
}
@@ -554,7 +546,6 @@ static void enable_slot(struct acpiphp_slot *slot, bool bridge)
}
pci_dev_put(dev);
}
- printk(KERN_INFO "%s %d\n", __func__, __LINE__);
}
/**
@@ -823,7 +814,6 @@ static void hotplug_event(u32 type, struct acpiphp_context *context)
switch (type) {
case ACPI_NOTIFY_BUS_CHECK:
/* bus re-enumerate */
- printk(KERN_INFO "%s %d: ACPI_NOTIFY_BUS_CHECK\n", __func__, __LINE__);
acpi_handle_debug(handle, "Bus check in %s()\n", __func__);
if (bridge)
acpiphp_check_bridge(bridge);
@@ -834,7 +824,6 @@ static void hotplug_event(u32 type, struct acpiphp_context *context)
case ACPI_NOTIFY_DEVICE_CHECK:
/* device check */
- printk(KERN_INFO "%s %d: ACPI_NOTIFY_DEVICE_CHECK\n", __func__, __LINE__);
acpi_handle_debug(handle, "Device check in %s()\n", __func__);
if (bridge) {
acpiphp_check_bridge(bridge);
@@ -850,23 +839,19 @@ static void hotplug_event(u32 type, struct acpiphp_context *context)
case ACPI_NOTIFY_EJECT_REQUEST:
/* request device eject */
- printk(KERN_INFO "%s %d: ACPI_NOTIFY_EJECT_REQUEST\n", __func__, __LINE__);
acpi_handle_debug(handle, "Eject request in %s()\n", __func__);
acpiphp_disable_and_eject_slot(slot);
break;
}
pci_unlock_rescan_remove();
- printk(KERN_INFO "%s %d:\n", __func__, __LINE__);
if (bridge)
put_bridge(bridge);
- printk(KERN_INFO "%s %d:\n", __func__, __LINE__);
}
static int acpiphp_hotplug_notify(struct acpi_device *adev, u32 type)
{
struct acpiphp_context *context;
- dev_info(&adev->dev, "%s %d: %s = %px\n", __func__, __LINE__, __func__, (void *) acpiphp_hotplug_notify);
context = acpiphp_grab_context(adev);
if (!context)
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 0888b0d5ec73..d88d6b6ccf5b 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -508,8 +508,6 @@ static inline int pm_runtime_get(struct device *dev)
*/
static inline int pm_runtime_get_sync(struct device *dev)
{
- if (!strcmp(dev_name(dev), "0000:00:01.1"))
- dev_info(dev, "%s\n", __func__);
return __pm_runtime_resume(dev, RPM_GET_PUT);
}
This is the message from 6.17.0-rc6-next-20250917-gpudebug-00028-gf99cf81b1da7 crashing,
captured via netconsole:
2025-10-06T04:52:35.932429+02:00 [T248]evmisc-0132 ev_queue_notify_reques: Dispatching Notify on [GPP0] (Device) Value 0x00 (Bus Check) Node 0000000069c9623b
2025-10-06T04:52:35.932429+02:00 [T177395]pcieport 0000:00:01.1: acpiphp_check_bridge 708#012 SUBSYSTEM=pci#012 DEVICE=+pci:0000:00:01.1
2025-10-06T04:52:35.932429+02:00 [T177395]pcieport 0000:00:01.1: __pm_runtime_resume 1243 dev = ffff97c001c930c8#012 SUBSYSTEM=pci#012 DEVICE=+pci:0000:00:01.1
2025-10-06T04:52:35.932429+02:00 [177395]pcieport 0000:00:01.1: rpm_resume 818 dev = ffff97c001c930c8#012 SUBSYSTEM=pci#012 DEVICE=+pci:0000:00:01.1
2025-10-06T04:52:35.932429+02:00 [177395]pcieport 0000:00:01.1: pm_runtime_deactivate_timer 146#012 SUBSYSTEM=pci#012 DEVICE=+pci:0000:00:01.1
2025-10-06T04:52:35.932429+02:00 [177395]pcieport 0000:00:01.1: rpm_resume 930#012 SUBSYSTEM=pci#012 DEVICE=+pci:0000:00:01.1
2025-10-06T04:52:35.932429+02:00 [177395]pcieport 0000:00:01.1: rpm_resume 959#012 SUBSYSTEM=pci#012 DEVICE=+pci:0000:00:01.1
2025-10-06T04:52:35.932429+02:00 [177395]pcieport 0000:00:01.1: rpm_resume 818 dev = ffff97c001c930c8#012 SUBSYSTEM=pci#012 DEVICE=+pci:0000:00:01.1
2025-10-06T04:52:35.932429+02:00 [177395]pcieport 0000:00:01.1: pm_runtime_deactivate_timer 146#012 SUBSYSTEM=pci#012 DEVICE=+pci:0000:00:01.1
2025-10-06T04:52:35.932429+02:00 [177395]pcieport 0000:00:01.1: rpm_resume 965#012 SUBSYSTEM=pci#012 DEVICE=+pci:0000:00:01.1
So the crash seems to happen in this part of rpm_resume():
[...]
skip_parent:
if (!strcmp(dev_name(dev), "0000:00:01.1"))
dev_info(dev, "%s %d\n", __func__, __LINE__); // this is the last reported line
if (dev->power.no_callbacks)
goto no_callback; /* Assume success. */
__update_runtime_status(dev, RPM_RESUMING);
callback = RPM_GET_CALLBACK(dev, runtime_resume);
dev_pm_disable_wake_irq_check(dev, false);
retval = rpm_callback(callback, dev);
if (retval) {
if (!strcmp(dev_name(dev), "0000:00:01.1"))
dev_info(dev, "%s %d\n", __func__, __LINE__);
__update_runtime_status(dev, RPM_SUSPENDED);
pm_runtime_cancel_pending(dev);
dev_pm_enable_wake_irq_check(dev, false);
} else {
no_callback:
if (!strcmp(dev_name(dev), "0000:00:01.1"))
dev_info(dev, "%s %d\n", __func__, __LINE__);
[...]
Bert Karwatzki
Powered by blists - more mailing lists