[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251006120944.7880-3-spasswolf@web.de>
Date: Mon, 6 Oct 2025 14:09:41 +0200
From: Bert Karwatzki <spasswolf@....de>
To: linux-kernel@...r.kernel.org
Cc: Bert Karwatzki <spasswolf@....de>,
linux-next@...r.kernel.org,
linux-stable@...r.kernel.org,
regressions@...ts.linux.dev,
linux-pci@...r.kernel.org,
linux-acpi@...r.kernel.org,
Mario Limonciello <superm1@...nel.org>,
Christian König <christian.koenig@....com>,
"Rafael J . Wysocki" <rafael.j.wysocki@...el.com>
Subject: [REGRESSION 02/04] Crash during resume of pcie bridge
The next step is to monitor pm_runtime_get_sync(), rpm_resume() and __pm_runtime_resume().
Here we need to use conditional debugging output or else we get messages at a
rate of about a million lines per minute.
These is the additional debugging used in
6.17.0-rc6-next-20250917-gpudebug-00024-g5c6b49b810db
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 7420b9851fe0..895898c3cd56 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -787,12 +787,18 @@ static int rpm_resume(struct device *dev, int rpmflags)
struct device *parent = NULL;
int retval = 0;
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
trace_rpm_resume(dev, rpmflags);
repeat:
if (dev->power.runtime_error) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
retval = -EINVAL;
} else if (dev->power.disable_depth > 0) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
if (dev->power.runtime_status == RPM_ACTIVE &&
dev->power.last_status == RPM_ACTIVE)
retval = 1;
@@ -808,31 +814,45 @@ static int rpm_resume(struct device *dev, int rpmflags)
* rather than cancelling it now only to restart it again in the near
* future.
*/
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
dev->power.request = RPM_REQ_NONE;
if (!dev->power.timer_autosuspends)
pm_runtime_deactivate_timer(dev);
if (dev->power.runtime_status == RPM_ACTIVE) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
retval = 1;
goto out;
}
if (dev->power.runtime_status == RPM_RESUMING ||
dev->power.runtime_status == RPM_SUSPENDING) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
DEFINE_WAIT(wait);
if (rpmflags & (RPM_ASYNC | RPM_NOWAIT)) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
if (dev->power.runtime_status == RPM_SUSPENDING) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
dev->power.deferred_resume = true;
if (rpmflags & RPM_NOWAIT)
retval = -EINPROGRESS;
} else {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
retval = -EINPROGRESS;
}
goto out;
}
if (dev->power.irq_safe) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
spin_unlock(&dev->power.lock);
cpu_relax();
@@ -856,6 +876,8 @@ static int rpm_resume(struct device *dev, int rpmflags)
spin_lock_irq(&dev->power.lock);
}
finish_wait(&dev->power.wait_queue, &wait);
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
goto repeat;
}
@@ -865,22 +887,32 @@ static int rpm_resume(struct device *dev, int rpmflags)
* the resume will actually succeed.
*/
if (dev->power.no_callbacks && !parent && dev->parent) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
spin_lock_nested(&dev->parent->power.lock, SINGLE_DEPTH_NESTING);
if (dev->parent->power.disable_depth > 0 ||
dev->parent->power.ignore_children ||
dev->parent->power.runtime_status == RPM_ACTIVE) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
atomic_inc(&dev->parent->power.child_count);
spin_unlock(&dev->parent->power.lock);
retval = 1;
goto no_callback; /* Assume success. */
}
spin_unlock(&dev->parent->power.lock);
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
}
/* Carry out an asynchronous or a synchronous resume. */
if (rpmflags & RPM_ASYNC) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
dev->power.request = RPM_REQ_RESUME;
if (!dev->power.request_pending) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
dev->power.request_pending = true;
queue_work(pm_wq, &dev->power.work);
}
@@ -894,6 +926,8 @@ static int rpm_resume(struct device *dev, int rpmflags)
* necessary. Not needed if dev is irq-safe; then the
* parent is permanently resumed.
*/
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
parent = dev->parent;
if (dev->power.irq_safe)
goto skip_parent;
@@ -909,6 +943,8 @@ static int rpm_resume(struct device *dev, int rpmflags)
*/
if (!parent->power.disable_depth &&
!parent->power.ignore_children) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
rpm_resume(parent, 0);
if (parent->power.runtime_status != RPM_ACTIVE)
retval = -EBUSY;
@@ -919,10 +955,14 @@ static int rpm_resume(struct device *dev, int rpmflags)
if (retval)
goto out;
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
goto repeat;
}
skip_parent:
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
if (dev->power.no_callbacks)
goto no_callback; /* Assume success. */
@@ -933,11 +973,15 @@ static int rpm_resume(struct device *dev, int rpmflags)
dev_pm_disable_wake_irq_check(dev, false);
retval = rpm_callback(callback, dev);
if (retval) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
__update_runtime_status(dev, RPM_SUSPENDED);
pm_runtime_cancel_pending(dev);
dev_pm_enable_wake_irq_check(dev, false);
} else {
no_callback:
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
__update_runtime_status(dev, RPM_ACTIVE);
pm_runtime_mark_last_busy(dev);
if (parent)
@@ -949,7 +993,11 @@ static int rpm_resume(struct device *dev, int rpmflags)
rpm_idle(dev, RPM_ASYNC);
out:
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
if (parent && !dev->power.irq_safe) {
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
spin_unlock_irq(&dev->power.lock);
pm_runtime_put(parent);
@@ -959,6 +1007,8 @@ static int rpm_resume(struct device *dev, int rpmflags)
trace_rpm_return_int(dev, _THIS_IP_, retval);
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
return retval;
}
@@ -1181,17 +1231,27 @@ int __pm_runtime_resume(struct device *dev, int rpmflags)
{
unsigned long flags;
int retval;
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe &&
dev->power.runtime_status != RPM_ACTIVE);
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
if (rpmflags & RPM_GET_PUT)
atomic_inc(&dev->power.usage_count);
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
spin_lock_irqsave(&dev->power.lock, flags);
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
retval = rpm_resume(dev, rpmflags);
spin_unlock_irqrestore(&dev->power.lock, flags);
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s %d\n", __func__, __LINE__);
return retval;
}
EXPORT_SYMBOL_GPL(__pm_runtime_resume);
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index d88d6b6ccf5b..0888b0d5ec73 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -508,6 +508,8 @@ static inline int pm_runtime_get(struct device *dev)
*/
static inline int pm_runtime_get_sync(struct device *dev)
{
+ if (!strcmp(dev_name(dev), "0000:00:01.1"))
+ dev_info(dev, "%s\n", __func__);
return __pm_runtime_resume(dev, RPM_GET_PUT);
}
With this there is no crash after 60h uptime with ~3093 GPP0 notifies, probably the
printk()s are mitigating the crash in some way (i.e. there's a race and the printk()s
are slowing down only one side ...).
It would be nice if we could get a crash while all the printk()s are in place,
but I'm not sure if we can ...
Stopped 6.17.0-rc6-next-20250917-gpudebug-00024-g5c6b49b810db after 60h and 3093 GPP0 notifies without crash.
Bert Karwatzki
Powered by blists - more mailing lists