linux-kernel - [PATCH v10 4/4] ACPI: APEI: handle synchronous exceptions in task work

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20231218064521.37324-5-xueshuai@linux.alibaba.com>
Date: Mon, 18 Dec 2023 14:45:21 +0800
From: Shuai Xue <xueshuai@...ux.alibaba.com>
To: bp@...en8.de,
	rafael@...nel.org,
	wangkefeng.wang@...wei.com,
	tanxiaofei@...wei.com,
	mawupeng1@...wei.com,
	tony.luck@...el.com,
	linmiaohe@...wei.com,
	naoya.horiguchi@....com,
	james.morse@....com,
	gregkh@...uxfoundation.org,
	will@...nel.org,
	jarkko@...nel.org
Cc: linux-acpi@...r.kernel.org,
	linux-mm@...ck.org,
	linux-kernel@...r.kernel.org,
	akpm@...ux-foundation.org,
	linux-edac@...r.kernel.org,
	acpica-devel@...ts.linuxfoundation.org,
	stable@...r.kernel.org,
	x86@...nel.org,
	xueshuai@...ux.alibaba.com,
	justin.he@....com,
	ardb@...nel.org,
	ying.huang@...el.com,
	ashish.kalra@....com,
	baolin.wang@...ux.alibaba.com,
	tglx@...utronix.de,
	mingo@...hat.com,
	dave.hansen@...ux.intel.com,
	lenb@...nel.org,
	hpa@...or.com,
	robert.moore@...el.com,
	lvying6@...wei.com,
	xiexiuqi@...wei.com,
	zhuo.song@...ux.alibaba.com
Subject: [PATCH v10 4/4] ACPI: APEI: handle synchronous exceptions in task work

Hardware errors can be signaled by an asynchronous interrupt, e.g. when an
error is detected by a background scrubber, or by a synchronous exception,
e.g. when a CPU tries to access a poisoned cache line. Both synchronous
and asynchronous errors are queued as memory_failure() work and handled by
a dedicated kthread in a workqueue.

However, memory failure recovery sends SIGBUS with the wrong si_code,
BUS_MCEERR_AO, for synchronous errors in early kill mode, even if
MF_ACTION_REQUIRED is set. The main problem is that the memory failure
work is handled in kthread context rather than in the context of the
user-space process that is accessing the corrupt memory location, so
kill_proc() sends SIGBUS with the BUS_MCEERR_AO si_code to the user-space
process instead of BUS_MCEERR_AR.

To this end, queue memory_failure() as a task_work so that the current
context in memory_failure() belongs exactly to the process consuming the
poisoned data, and it will send SIGBUS with the proper si_code.

Signed-off-by: Shuai Xue <xueshuai@...ux.alibaba.com>
Tested-by: Ma Wupeng <mawupeng1@...wei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@...wei.com>
Reviewed-by: Xiaofei Tan <tanxiaofei@...wei.com>
Reviewed-by: Baolin Wang <baolin.wang@...ux.alibaba.com>
---
 drivers/acpi/apei/ghes.c | 77 +++++++++++++++++++++++-----------------
 include/acpi/ghes.h      |  3 --
 mm/memory-failure.c      | 13 -------
 3 files changed, 44 insertions(+), 49 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index f832ffc5a88d..a6b4907cfe47 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -464,28 +464,41 @@ static void ghes_clear_estatus(struct ghes *ghes,
 }
 
 /*
- * Called as task_work before returning to user-space.
- * Ensure any queued work has been done before we return to the context that
- * triggered the notification.
+ * struct sync_task_work - for synchronous RAS event
+ *
+ * @twork:                callback_head for task work
+ * @pfn:                  page frame number of corrupted page
+ * @flags:                fine tune action taken
+ *
+ * Structure to pass task work to be handled before
+ * ret_to_user via task_work_add().
  */
-static void ghes_kick_task_work(struct callback_head *head)
+struct sync_task_work {
+	struct callback_head twork;
+	u64 pfn;
+	int flags;
+};
+
+static void memory_failure_cb(struct callback_head *twork)
 {
-	struct acpi_hest_generic_status *estatus;
-	struct ghes_estatus_node *estatus_node;
-	u32 node_len;
+	int ret;
+	struct sync_task_work *twcb =
+		container_of(twork, struct sync_task_work, twork);
 
-	estatus_node = container_of(head, struct ghes_estatus_node, task_work);
-	if (IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
-		memory_failure_queue_kick(estatus_node->task_work_cpu);
+	ret = memory_failure(twcb->pfn, twcb->flags);
+	gen_pool_free(ghes_estatus_pool, (unsigned long)twcb, sizeof(*twcb));
 
-	estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
-	node_len = GHES_ESTATUS_NODE_LEN(cper_estatus_len(estatus));
-	gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len);
+	if (!ret || ret == -EHWPOISON || ret == -EOPNOTSUPP)
+		return;
+
+	pr_err("Sending SIGBUS to current task due to memory error not recovered");
+	force_sig(SIGBUS);
 }
 
 static bool ghes_do_memory_failure(u64 physical_addr, int flags)
 {
 	unsigned long pfn;
+	struct sync_task_work *twcb;
 
 	if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
 		return false;
@@ -498,6 +511,18 @@ static bool ghes_do_memory_failure(u64 physical_addr, int flags)
 		return false;
 	}
 
+	if (flags == MF_ACTION_REQUIRED && current->mm) {
+		twcb = (void *)gen_pool_alloc(ghes_estatus_pool, sizeof(*twcb));
+		if (!twcb)
+			return false;
+
+		twcb->pfn = pfn;
+		twcb->flags = flags;
+		init_task_work(&twcb->twork, memory_failure_cb);
+		task_work_add(current, &twcb->twork, TWA_RESUME);
+		return true;
+	}
+
 	memory_failure_queue(pfn, flags);
 	return true;
 }
@@ -673,7 +698,7 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata,
 	schedule_work(&entry->work);
 }
 
-static bool ghes_do_proc(struct ghes *ghes,
+static void ghes_do_proc(struct ghes *ghes,
 			 const struct acpi_hest_generic_status *estatus)
 {
 	int sev, sec_sev;
@@ -725,8 +750,6 @@ static bool ghes_do_proc(struct ghes *ghes,
 		pr_err("Sending SIGBUS to current task due to memory error not recovered");
 		force_sig(SIGBUS);
 	}
-
-	return queued;
 }
 
 static void __ghes_print_estatus(const char *pfx,
@@ -1028,9 +1051,7 @@ static void ghes_proc_in_irq(struct irq_work *irq_work)
 	struct ghes_estatus_node *estatus_node;
 	struct acpi_hest_generic *generic;
 	struct acpi_hest_generic_status *estatus;
-	bool task_work_pending;
 	u32 len, node_len;
-	int ret;
 
 	llnode = llist_del_all(&ghes_estatus_llist);
 	/*
@@ -1045,25 +1066,16 @@ static void ghes_proc_in_irq(struct irq_work *irq_work)
 		estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
 		len = cper_estatus_len(estatus);
 		node_len = GHES_ESTATUS_NODE_LEN(len);
-		task_work_pending = ghes_do_proc(estatus_node->ghes, estatus);
+
+		ghes_do_proc(estatus_node->ghes, estatus);
+
 		if (!ghes_estatus_cached(estatus)) {
 			generic = estatus_node->generic;
 			if (ghes_print_estatus(NULL, generic, estatus))
 				ghes_estatus_cache_add(generic, estatus);
 		}
-
-		if (task_work_pending && current->mm) {
-			estatus_node->task_work.func = ghes_kick_task_work;
-			estatus_node->task_work_cpu = smp_processor_id();
-			ret = task_work_add(current, &estatus_node->task_work,
-					    TWA_RESUME);
-			if (ret)
-				estatus_node->task_work.func = NULL;
-		}
-
-		if (!estatus_node->task_work.func)
-			gen_pool_free(ghes_estatus_pool,
-				      (unsigned long)estatus_node, node_len);
+		gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node,
+			      node_len);
 
 		llnode = next;
 	}
@@ -1124,7 +1136,6 @@ static int ghes_in_nmi_queue_one_entry(struct ghes *ghes,
 
 	estatus_node->ghes = ghes;
 	estatus_node->generic = ghes->generic;
-	estatus_node->task_work.func = NULL;
 	estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
 
 	if (__ghes_read_estatus(estatus, buf_paddr, fixmap_idx, len)) {
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index be1dd4c1a917..ebd21b05fe6e 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -35,9 +35,6 @@ struct ghes_estatus_node {
 	struct llist_node llnode;
 	struct acpi_hest_generic *generic;
 	struct ghes *ghes;
-
-	int task_work_cpu;
-	struct callback_head task_work;
 };
 
 struct ghes_estatus_cache {
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index bd3dcafdfa4a..6bff57444928 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -2451,19 +2451,6 @@ static void memory_failure_work_func(struct work_struct *work)
 	}
 }
 
-/*
- * Process memory_failure work queued on the specified CPU.
- * Used to avoid return-to-userspace racing with the memory_failure workqueue.
- */
-void memory_failure_queue_kick(int cpu)
-{
-	struct memory_failure_cpu *mf_cpu;
-
-	mf_cpu = &per_cpu(memory_failure_cpu, cpu);
-	cancel_work_sync(&mf_cpu->work);
-	memory_failure_work_func(&mf_cpu->work);
-}
-
 static int __init memory_failure_init(void)
 {
 	struct memory_failure_cpu *mf_cpu;
-- 
2.39.3