[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20150504093347.GC3829@pd.tnic>
Date: Mon, 4 May 2015 11:33:47 +0200
From: Borislav Petkov <bp@...en8.de>
To: Ingo Molnar <mingo@...nel.org>
Cc: Tony Luck <tony.luck@...el.com>, Jiri Kosina <jkosina@...e.cz>,
x86-ml <x86@...nel.org>, linux-edac <linux-edac@...r.kernel.org>,
lkml <linux-kernel@...r.kernel.org>,
"Rafael J. Wysocki" <rjw@...ysocki.net>,
Len Brown <lenb@...nel.org>
Subject: [GIT PULL] RAS for 4.2
Hi Ingo,
please queue for 4.2.
Thanks.
@Len, @Rafael: Guys, I'm sending this through tip even though strictly
speaking it is ACPI.
I also am guessing to the point of being almost right, though, that
you're secretly happy I'm doing that because it is one less issue you
have to deal with. :-) :-)
Let me know if you still want to pick that up though.
Thanks.
---
The following changes since commit b787f68c36d49bb1d9236f403813641efa74a031:
Linux 4.1-rc1 (2015-04-26 17:59:10 -0700)
are available in the git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git tags/ras_for_4.2
for you to fetch changes up to 6fe9e7c26a97105645fd24f264f1b94e21aade3e:
GHES: Make NMI handler have a single reader (2015-04-27 21:35:33 +0200)
----------------------------------------------------------------
GHES: Seriously speed up and clean up NMI handler (Jiri Kosina and Borislav Petkov)
This is the result of us seeing this during boot:
[ 24.332560] INFO: NMI handler (ghes_notify_nmi) took too long to run: 3.265 msecs
[ 24.332567] INFO: NMI handler (ghes_notify_nmi) took too long to run: 5.946 msecs
[ 24.332568] INFO: NMI handler (ghes_notify_nmi) took too long to run: 5.948 msecs
and a report of people running perf stat and the machine softlocking.
"hest_disable" was helping in this case, which made us look at that
particular NMI handler. It was grabbing a lock each time it ran, on
each CPU. But this is not needed, as the GHES sources are global and
they need only a single reader.
This patchset does that and cleans up the handler in the process.
----------------------------------------------------------------
Borislav Petkov (4):
GHES: Carve out error queueing in a separate function
GHES: Carve out the panic functionality
GHES: Panic right after detection
GHES: Elliminate double-loop in the NMI handler
Jiri Kosina (1):
GHES: Make NMI handler have a single reader
drivers/acpi/apei/ghes.c | 108 ++++++++++++++++++++++++-----------------------
1 file changed, 55 insertions(+), 53 deletions(-)
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index e82d0976a5d0..2bfd53cbfe80 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -729,10 +729,10 @@ static struct llist_head ghes_estatus_llist;
static struct irq_work ghes_proc_irq_work;
/*
- * NMI may be triggered on any CPU, so ghes_nmi_lock is used for
- * mutual exclusion.
+ * NMI may be triggered on any CPU, so ghes_in_nmi is used for
+ * having only one concurrent reader.
*/
-static DEFINE_RAW_SPINLOCK(ghes_nmi_lock);
+static atomic_t ghes_in_nmi = ATOMIC_INIT(0);
static LIST_HEAD(ghes_nmi);
@@ -797,73 +797,75 @@ static void ghes_print_queued_estatus(void)
}
}
+/* Save estatus for further processing in IRQ context */
+static void __process_error(struct ghes *ghes)
+{
+#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+ u32 len, node_len;
+ struct ghes_estatus_node *estatus_node;
+ struct acpi_hest_generic_status *estatus;
+
+ if (ghes_estatus_cached(ghes->estatus))
+ return;
+
+ len = cper_estatus_len(ghes->estatus);
+ node_len = GHES_ESTATUS_NODE_LEN(len);
+
+ estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool, node_len);
+ if (!estatus_node)
+ return;
+
+ estatus_node->ghes = ghes;
+ estatus_node->generic = ghes->generic;
+ estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
+ memcpy(estatus, ghes->estatus, len);
+ llist_add(&estatus_node->llnode, &ghes_estatus_llist);
+#endif
+}
+
+static void __ghes_panic(struct ghes *ghes)
+{
+ oops_begin();
+ ghes_print_queued_estatus();
+ __ghes_print_estatus(KERN_EMERG, ghes->generic, ghes->estatus);
+
+ /* reboot to log the error! */
+ if (panic_timeout == 0)
+ panic_timeout = ghes_panic_timeout;
+ panic("Fatal hardware error!");
+}
+
static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
{
- struct ghes *ghes, *ghes_global = NULL;
- int sev, sev_global = -1;
- int ret = NMI_DONE;
+ struct ghes *ghes;
+ int sev, ret = NMI_DONE;
+
+ if (!atomic_add_unless(&ghes_in_nmi, 1, 1))
+ return ret;
- raw_spin_lock(&ghes_nmi_lock);
list_for_each_entry_rcu(ghes, &ghes_nmi, list) {
if (ghes_read_estatus(ghes, 1)) {
ghes_clear_estatus(ghes);
continue;
}
- sev = ghes_severity(ghes->estatus->error_severity);
- if (sev > sev_global) {
- sev_global = sev;
- ghes_global = ghes;
- }
- ret = NMI_HANDLED;
- }
-
- if (ret == NMI_DONE)
- goto out;
- if (sev_global >= GHES_SEV_PANIC) {
- oops_begin();
- ghes_print_queued_estatus();
- __ghes_print_estatus(KERN_EMERG, ghes_global->generic,
- ghes_global->estatus);
- /* reboot to log the error! */
- if (panic_timeout == 0)
- panic_timeout = ghes_panic_timeout;
- panic("Fatal hardware error!");
- }
+ sev = ghes_severity(ghes->estatus->error_severity);
+ if (sev >= GHES_SEV_PANIC)
+ __ghes_panic(ghes);
- list_for_each_entry_rcu(ghes, &ghes_nmi, list) {
-#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
- u32 len, node_len;
- struct ghes_estatus_node *estatus_node;
- struct acpi_hest_generic_status *estatus;
-#endif
if (!(ghes->flags & GHES_TO_CLEAR))
continue;
-#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
- if (ghes_estatus_cached(ghes->estatus))
- goto next;
- /* Save estatus for further processing in IRQ context */
- len = cper_estatus_len(ghes->estatus);
- node_len = GHES_ESTATUS_NODE_LEN(len);
- estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool,
- node_len);
- if (estatus_node) {
- estatus_node->ghes = ghes;
- estatus_node->generic = ghes->generic;
- estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
- memcpy(estatus, ghes->estatus, len);
- llist_add(&estatus_node->llnode, &ghes_estatus_llist);
- }
-next:
-#endif
+
+ __process_error(ghes);
ghes_clear_estatus(ghes);
+
+ ret = NMI_HANDLED;
}
+
#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
irq_work_queue(&ghes_proc_irq_work);
#endif
-
-out:
- raw_spin_unlock(&ghes_nmi_lock);
+ atomic_dec(&ghes_in_nmi);
return ret;
}
--
Regards/Gruss,
Boris.
ECO tip #101: Trim your mails when you reply.
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists