linux-kernel - Re: Suspend-resume failure on Intel Eagle Lake Core2Duo

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <alpine.DEB.2.20.1707272306310.2368@nanos>
Date:   Thu, 27 Jul 2017 23:08:00 +0200 (CEST)
From:   Thomas Gleixner <tglx@...utronix.de>
To:     Tomi Sarvela <tomi.p.sarvela@...el.com>
cc:     Martin Peres <martin.peres@...ux.intel.com>,
        jeffy.chen@...k-chips.com, linux-kernel@...r.kernel.org
Subject: Re: Suspend-resume failure on Intel Eagle Lake Core2Duo

On Thu, 27 Jul 2017, Thomas Gleixner wrote:
> On Thu, 27 Jul 2017, Thomas Gleixner wrote:
> > On Thu, 27 Jul 2017, Tomi Sarvela wrote:
> > 
> > > On 27/07/17 10:42, Thomas Gleixner wrote:
> > > > On Thu, 27 Jul 2017, Tomi Sarvela wrote:
> > > > > On 26/07/17 17:26, Thomas Gleixner wrote:
> > > > > > So reverting that commit does not help. Does it help on your machine?
> > > > > 
> > > > > Yes. Reverting it does not cause the machine to lock up on resume.
> > > > > 
> > > > > I haven't tested if the machine locks up later on, but at least it
> > > > > survives
> > > > > couple of s/r cycles.
> > > > 
> > > > Can you please try to add 'nohpet' to the kernel command line?
> > > 
> > > Option nohpet didn't change anything, still hangs on s/r.
> > 
> > Ok. Was a shot in the dark. I tried on a similar machine, but that one
> > resumes fine (except that the AHCI controller plays silly buggers, but
> > nothing interrupt related). I might have access to another core2duo machine
> > tomorrow.
> > 
> > I'll send you a debug patch shortly, but can you please first check when
> > the wreckage happens by testing the states in
> > 
> >     /sys/power/pm_test
> > 
> > freezer
> > devices
> > platform
> > processors
> > core
> 
> Actually for suspend to ram we only have
> 
> 	 freezer, devices, platform
> 
> I assume it's platform because that is where the actual interrupt
> suspend/resume happens.
> 
> If that survives, then it's the low level architecture s/r code which
> fiddles with the interrupt controllers and leaves them in a state which is
> not known to the core code.

Debug patch below. It should make the machine resume again. Emphasis on
"should". Please provide the output of /sys/kernel/debug/tracing/trace
after resume.

Thanks,

	tglx

8<-----------

--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -304,7 +304,10 @@ void irq_shutdown(struct irq_desc *desc)
 
 void irq_enable(struct irq_desc *desc)
 {
-	if (!irqd_irq_disabled(&desc->irq_data)) {
+	if (irq_suspend_resume)
+		irq_trace_state("preenable", desc);
+
+	if (!irqd_irq_disabled(&desc->irq_data) && !irq_suspend_resume) {
 		unmask_irq(desc);
 	} else {
 		irq_state_clr_disabled(desc);
@@ -315,10 +318,16 @@ void irq_enable(struct irq_desc *desc)
 			unmask_irq(desc);
 		}
 	}
+
+	if (irq_suspend_resume)
+		irq_trace_state("postenable", desc);
 }
 
 static void __irq_disable(struct irq_desc *desc, bool mask)
 {
+	if (irq_suspend_resume)
+		irq_trace_state("predisable", desc);
+
 	if (irqd_irq_disabled(&desc->irq_data)) {
 		if (mask)
 			mask_irq(desc);
@@ -331,6 +340,9 @@ static void __irq_disable(struct irq_des
 			mask_irq(desc);
 		}
 	}
+
+	if (irq_suspend_resume)
+		irq_trace_state("postdisable", desc);
 }
 
 /**
@@ -390,6 +402,9 @@ static inline void mask_ack_irq(struct i
 
 void mask_irq(struct irq_desc *desc)
 {
+	if (irq_suspend_resume)
+		irq_trace_state("premask", desc);
+
 	if (irqd_irq_masked(&desc->irq_data))
 		return;
 
@@ -397,17 +412,26 @@ void mask_irq(struct irq_desc *desc)
 		desc->irq_data.chip->irq_mask(&desc->irq_data);
 		irq_state_set_masked(desc);
 	}
+
+	if (irq_suspend_resume)
+		irq_trace_state("postmask", desc);
 }
 
 void unmask_irq(struct irq_desc *desc)
 {
-	if (!irqd_irq_masked(&desc->irq_data))
+	if (irq_suspend_resume)
+		irq_trace_state("preunmask", desc);
+
+	if (!irqd_irq_masked(&desc->irq_data) && !irq_suspend_resume)
 		return;
 
 	if (desc->irq_data.chip->irq_unmask) {
 		desc->irq_data.chip->irq_unmask(&desc->irq_data);
 		irq_state_clr_masked(desc);
 	}
+
+	if (irq_suspend_resume)
+		irq_trace_state("postunmask", desc);
 }
 
 void unmask_threaded_irq(struct irq_desc *desc)
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -459,3 +459,11 @@ static inline void irq_remove_debugfs_en
 {
 }
 #endif /* CONFIG_GENERIC_IRQ_DEBUGFS */
+
+extern bool irq_suspend_resume;	/* debug flag; the single definition lives in kernel/irq/pm.c */
+
+static inline void irq_trace_state(const char *what, struct irq_desc *desc)
+{
+	trace_printk("%s %d state %08x\n", what, irq_desc_get_irq(desc),
+		     irqd_get(&desc->irq_data));
+}
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -14,6 +14,8 @@
 
 #include "internals.h"
 
+bool irq_suspend_resume;
+
 bool irq_pm_check_wakeup(struct irq_desc *desc)
 {
 	if (irqd_is_wakeup_armed(&desc->irq_data)) {
@@ -120,6 +122,7 @@ void suspend_device_irqs(void)
 	struct irq_desc *desc;
 	int irq;
 
+	irq_suspend_resume = true;
 	for_each_irq_desc(irq, desc) {
 		unsigned long flags;
 		bool sync;
@@ -127,7 +130,9 @@ void suspend_device_irqs(void)
 		if (irq_settings_is_nested_thread(desc))
 			continue;
 		raw_spin_lock_irqsave(&desc->lock, flags);
+		irq_trace_state("presuspend", desc);
 		sync = suspend_device_irq(desc);
+		irq_trace_state("postsuspend", desc);
 		raw_spin_unlock_irqrestore(&desc->lock, flags);
 
 		if (sync)
@@ -172,9 +177,14 @@ static void resume_irqs(bool want_early)
 			continue;
 
 		raw_spin_lock_irqsave(&desc->lock, flags);
+		irq_trace_state("preresume", desc);
 		resume_irq(desc);
+		irq_trace_state("postresume", desc);
 		raw_spin_unlock_irqrestore(&desc->lock, flags);
 	}
+
+	if (!want_early)
+		irq_suspend_resume = false;
 }
 
 /**