Message-Id: <924aa7959502c4c3271cb311632eb505e894e26e.1748594841.git.libo.gcs85@bytedance.com>
Date: Fri, 30 May 2025 17:27:49 +0800
From: Bo Li <libo.gcs85@...edance.com>
To: tglx@...utronix.de,
	mingo@...hat.com,
	bp@...en8.de,
	dave.hansen@...ux.intel.com,
	x86@...nel.org,
	luto@...nel.org,
	kees@...nel.org,
	akpm@...ux-foundation.org,
	david@...hat.com,
	juri.lelli@...hat.com,
	vincent.guittot@...aro.org,
	peterz@...radead.org
Cc: dietmar.eggemann@....com,
	hpa@...or.com,
	acme@...nel.org,
	namhyung@...nel.org,
	mark.rutland@....com,
	alexander.shishkin@...ux.intel.com,
	jolsa@...nel.org,
	irogers@...gle.com,
	adrian.hunter@...el.com,
	kan.liang@...ux.intel.com,
	viro@...iv.linux.org.uk,
	brauner@...nel.org,
	jack@...e.cz,
	lorenzo.stoakes@...cle.com,
	Liam.Howlett@...cle.com,
	vbabka@...e.cz,
	rppt@...nel.org,
	surenb@...gle.com,
	mhocko@...e.com,
	rostedt@...dmis.org,
	bsegall@...gle.com,
	mgorman@...e.de,
	vschneid@...hat.com,
	jannh@...gle.com,
	pfalcato@...e.de,
	riel@...riel.com,
	harry.yoo@...cle.com,
	linux-kernel@...r.kernel.org,
	linux-perf-users@...r.kernel.org,
	linux-fsdevel@...r.kernel.org,
	linux-mm@...ck.org,
	duanxiongchun@...edance.com,
	yinhongbo@...edance.com,
	dengliang.1214@...edance.com,
	xieyongji@...edance.com,
	chaiwen.cc@...edance.com,
	songmuchun@...edance.com,
	yuanzhu@...edance.com,
	chengguozhu@...edance.com,
	sunjiadong.lff@...edance.com,
	Bo Li <libo.gcs85@...edance.com>
Subject: [RFC v2 21/35] RPAL: add kernel entry handling for lazy switch

RPAL performs the lazy switch at kernel entry, so every kernel entry point
has to be hooked to run the lazy switch logic. Besides the operations the
lazy switch itself requires (such as ensuring that the general-purpose
registers are preserved across the switch), the task running before the
lazy switch loses its user-mode context, which is handed over to the task
running after the switch. The kernel entry code therefore also has to deal
with the previous task losing its user-mode context.

This patch hooks every location where a transition from user mode to
kernel mode occurs, namely entry_SYSCALL_64, error_entry, and asm_exc_nmi.
When the kernel detects a mismatch between the kernel-mode and user-mode
contexts, it runs the lazy switch logic. Taking the switch from sender to
receiver as an example: the receiver thread is first locked to the CPU the
sender is running on, then the receiver thread, which is in the CALL state,
is woken up via rpal_try_to_wake_up(). The sender's general-purpose
register state (pt_regs) is copied to the receiver, and rpal_schedule() is
called to complete the lazy switch. To deal with the sender losing its
context, the kernel loads the sender's pre-saved user-mode context into the
sender's pt_regs and constructs the sender's kernel stack frame in a manner
similar to fork.
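
The sender-to-receiver direction can be summarized by the simplified C
sketch below. It is not the patch code itself but mirrors
rpal_do_kernel_context_switch() further down; the helper name is made up
for illustration and error handling is omitted:

  /* Hypothetical helper condensing the sender -> receiver lazy switch. */
  static void lazy_switch_sender_to_receiver(struct task_struct *receiver,
                                             struct pt_regs *regs)
  {
          /* Pin both tasks to this CPU so no second control flow can run. */
          rpal_lock_cpu(current);
          rpal_lock_cpu(receiver);

          /* Wake the receiver, which is blocked in the CALL state. */
          rpal_try_to_wake_up(receiver);

          /* Hand the sender's user-mode registers (pt_regs) to the receiver. */
          *task_pt_regs(receiver) = *regs;

          /*
           * The sender has lost its user context: reload the context it
           * pre-saved so that it resumes via rpal_ret_from_lazy_switch,
           * much like a freshly forked task returns via ret_from_fork.
           */
          rebuild_sender_stack(current->rpal_sd, regs);

          /* Switch directly to the receiver. */
          rpal_schedule(receiver);
  }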

Handling the switch from the receiver back to the sender is similar, except
that the receiver is unlocked from the current CPU and can only return to
user mode through the kernel return path.
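
The reverse direction, again as a simplified sketch with a made-up helper
name, mirrors the else branch of rpal_do_kernel_context_switch() and
rpal_lazy_switch_tail() below:

  /* Hypothetical helper condensing the receiver -> sender switch back. */
  static void lazy_switch_receiver_to_sender(struct task_struct *sender,
                                             struct pt_regs *regs)
  {
          struct task_struct *prev = current;

          /* Hand the receiver's user-mode registers back to the sender. */
          *task_pt_regs(sender) = *regs;

          /* Reload the receiver's pre-saved user context, as for the sender. */
          rebuild_receiver_stack(current->rpal_rd, regs);

          rpal_schedule(sender);

          /* The receiver is no longer lazily switched to ... */
          rpal_clear_task_thread_flag(prev, RPAL_LAZY_SWITCHED_BIT);
          prev->rpal_rd->sender = NULL;
          /* ... and both tasks get unpinned in rpal_lazy_switch_tail(). */
  }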

Signed-off-by: Bo Li <libo.gcs85@...edance.com>
---
 arch/x86/entry/entry_64.S     | 137 ++++++++++++++++++++++++++++++++++
 arch/x86/kernel/asm-offsets.c |   3 +
 arch/x86/rpal/core.c          | 137 ++++++++++++++++++++++++++++++++++
 include/linux/rpal.h          |   6 ++
 4 files changed, 283 insertions(+)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 13b4d0684575..59c38627510d 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -118,6 +118,20 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
 	UNTRAIN_RET
 	CLEAR_BRANCH_HISTORY
 
+#ifdef CONFIG_RPAL
+	/*
+	 * First check whether this is an RPAL sender/receiver via
+	 * current->rpal_cd. Non-RPAL tasks just skip this; for an RPAL
+	 * task, we may need to check whether it has to perform a lazy
+	 * switch.
+	 */
+	movq	PER_CPU_VAR(current_task), %r13
+	movq	TASK_rpal_cd(%r13), %rax
+	testq	%rax, %rax
+	jz		_do_syscall
+	jmp 	do_rpal_syscall
+_do_syscall:
+#endif
 	call	do_syscall_64		/* returns with IRQs disabled */
 
 	/*
@@ -190,6 +204,101 @@ SYM_CODE_START(rpal_ret_from_lazy_switch)
 	jmp	swapgs_restore_regs_and_return_to_usermode
 #endif
 SYM_CODE_END(rpal_ret_from_lazy_switch)
+
+/* return address offset of stack frame */
+#define RPAL_FRAME_RET_ADDR_OFFSET -56
+
+SYM_CODE_START(do_rpal_syscall)
+	movq	%rsp, %r14
+	call	rpal_syscall_64_context_switch
+	testq   %rax, %rax
+	jz		1f
+
+	/*
+	 * When we get here, everything except the stack switch is done,
+	 * which leaves the current task running on another task's kernel
+	 * stack. Thus, we need to perform the stack switch here.
+	 *
+	 * Meanwhile, the previous task's stack content is clobbered, so we
+	 * also need to rebuild its stack frame so that it jumps to
+	 * rpal_ret_from_lazy_switch when it is scheduled back in. This is
+	 * inspired by ret_from_fork.
+	 */
+	movq    TASK_threadsp(%rax), %rsp
+#ifdef CONFIG_STACKPROTECTOR
+	movq	TASK_stack_canary(%rax), %rbx
+	movq	%rbx, PER_CPU_VAR(__stack_chk_guard)
+#endif
+	/* rebuild src's frame */
+	movq	$rpal_ret_from_lazy_switch, -8(%r14)
+	leaq	RPAL_FRAME_RET_ADDR_OFFSET(%r14), %rbx
+	movq	%rbx, TASK_threadsp(%r13)
+
+	movq	%r13, %rdi
+	/*
+	 * The task switch itself is done, but we still need to do a
+	 * few extra things for the lazy switch.
+	 */
+	call	rpal_lazy_switch_tail
+
+1:
+	movq	ORIG_RAX(%rsp), %rsi
+	movq	%rsp, %rdi
+	jmp		_do_syscall
+SYM_CODE_END(do_rpal_syscall)
+
+SYM_CODE_START(do_rpal_error)
+	popq	%r12
+	movq	%rax, %rsp
+	movq	%rax, %r14
+	movq	%rax, %rdi
+	call	rpal_exception_context_switch
+	testq   %rax, %rax
+	jz		1f
+
+	movq	TASK_threadsp(%rax), %rsp
+	ENCODE_FRAME_POINTER
+#ifdef CONFIG_STACKPROTECTOR
+	movq	TASK_stack_canary(%rax), %rbx
+	movq	%rbx, PER_CPU_VAR(__stack_chk_guard)
+#endif
+	/* rebuild src's frame */
+	movq	$rpal_ret_from_lazy_switch, -8(%r14)
+	leaq	RPAL_FRAME_RET_ADDR_OFFSET(%r14), %rbx
+	movq	%rbx, TASK_threadsp(%r13)
+
+	movq	%r13, %rdi
+	call	rpal_lazy_switch_tail
+1:
+	movq	%rsp, %rax
+	pushq	%r12
+	jmp		_do_error
+SYM_CODE_END(do_rpal_error)
+
+SYM_CODE_START(do_rpal_nmi)
+	movq	%rsp, %r14
+	movq	%rsp, %rdi
+	call	rpal_nmi_context_switch
+	testq   %rax, %rax
+	jz		1f
+
+	movq    TASK_threadsp(%rax), %rsp
+	ENCODE_FRAME_POINTER
+#ifdef CONFIG_STACKPROTECTOR
+	movq	TASK_stack_canary(%rax), %rbx
+	movq	%rbx, PER_CPU_VAR(__stack_chk_guard)
+#endif
+	/* rebuild src's frame */
+	movq	$rpal_ret_from_lazy_switch, -8(%r14)
+	leaq	RPAL_FRAME_RET_ADDR_OFFSET(%r14), %rbx
+	movq	%rbx, TASK_threadsp(%r13)
+
+	movq	%r13, %rdi
+	call	rpal_lazy_switch_tail
+
+1:
+	jmp		_do_nmi
+SYM_CODE_END(do_rpal_nmi)
 #endif
 
 /*
@@ -1047,7 +1156,22 @@ SYM_CODE_START(error_entry)
 
 	leaq	8(%rsp), %rdi			/* arg0 = pt_regs pointer */
 	/* Put us onto the real thread stack. */
+#ifdef CONFIG_RPAL
+	call sync_regs
+	/*
+	 * Check whether we need to perform a lazy switch after
+	 * switching to the real thread stack.
+	 */
+	movq	PER_CPU_VAR(current_task), %r13
+	movq	TASK_rpal_cd(%r13), %rdi
+	testq	%rdi, %rdi
+	jz		_do_error
+	jmp 	do_rpal_error
+_do_error:
+	RET
+#else
 	jmp	sync_regs
+#endif
 
 	/*
 	 * There are two places in the kernel that can potentially fault with
@@ -1206,6 +1330,19 @@ SYM_CODE_START(asm_exc_nmi)
 	IBRS_ENTER
 	UNTRAIN_RET
 
+#ifdef CONFIG_RPAL
+	/*
+	 * Check whether we need to perform a lazy switch; this is
+	 * only needed when we come from userspace.
+	 */
+	movq	PER_CPU_VAR(current_task), %r13
+	movq	TASK_rpal_cd(%r13), %rax
+	testq	%rax, %rax
+	jz		_do_nmi
+	jmp 	do_rpal_nmi
+_do_nmi:
+#endif
+
 	/*
 	 * At this point we no longer need to worry about stack damage
 	 * due to nesting -- we're on the normal thread stack and we're
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 6259b474073b..010202c31b37 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -46,6 +46,9 @@ static void __used common(void)
 #ifdef CONFIG_STACKPROTECTOR
 	OFFSET(TASK_stack_canary, task_struct, stack_canary);
 #endif
+#ifdef CONFIG_RPAL
+	OFFSET(TASK_rpal_cd, task_struct, rpal_cd);
+#endif
 
 	BLANK();
 	OFFSET(pbe_address, pbe, address);
diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index ed4c11e6838c..c48df1ce4324 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/rpal.h>
+#include <linux/sched/task_stack.h>
 #include <asm/fsgsbase.h>
 
 #include "internal.h"
@@ -39,6 +40,20 @@ static inline void rpal_unlock_cpu_kernel_ret(struct task_struct *tsk)
 	rpal_set_cpus_allowed_ptr(tsk, false, true);
 }
 
+void rpal_lazy_switch_tail(struct task_struct *tsk)
+{
+	struct rpal_receiver_call_context *rcc;
+
+	if (rpal_test_task_thread_flag(current, RPAL_LAZY_SWITCHED_BIT)) {
+		rcc = current->rpal_rd->rcc;
+		atomic_cmpxchg(&rcc->receiver_state, rpal_build_call_state(tsk->rpal_sd),
+			       RPAL_RECEIVER_STATE_LAZY_SWITCH);
+	} else {
+		rpal_unlock_cpu(tsk);
+		rpal_unlock_cpu(current);
+	}
+}
+
 void rpal_kernel_ret(struct pt_regs *regs)
 {
 	struct task_struct *tsk;
@@ -76,6 +91,87 @@ void rpal_kernel_ret(struct pt_regs *regs)
 	}
 }
 
+static inline void rebuild_stack(struct rpal_task_context *ctx,
+				 struct pt_regs *regs)
+{
+	regs->r12 = ctx->r12;
+	regs->r13 = ctx->r13;
+	regs->r14 = ctx->r14;
+	regs->r15 = ctx->r15;
+	regs->bx = ctx->rbx;
+	regs->bp = ctx->rbp;
+	regs->ip = ctx->rip;
+	regs->sp = ctx->rsp;
+}
+
+static inline void rebuild_sender_stack(struct rpal_sender_data *rsd,
+				 struct pt_regs *regs)
+{
+	rebuild_stack(&rsd->scc->rtc, regs);
+}
+
+static inline void rebuild_receiver_stack(struct rpal_receiver_data *rrd,
+				   struct pt_regs *regs)
+{
+	rebuild_stack(&rrd->rcc->rtc, regs);
+}
+
+static inline void update_dst_stack(struct task_struct *next,
+				    struct pt_regs *src)
+{
+	struct pt_regs *dst;
+
+	dst = task_pt_regs(next);
+	*dst = *src;
+	next->thread.sp = (unsigned long)dst;
+}
+
+/*
+ * rpal_do_kernel_context_switch - the main routine of RPAL lazy switch
+ * @next: task to switch to
+ * @regs: the user pt_regs saved in kernel entry
+ *
+ * This function performs the lazy switch. When switching from sender to
+ * receiver, both tasks need to be locked to the current CPU to avoid a
+ * double control flow during and after the lazy switch.
+ */
+static struct task_struct *
+rpal_do_kernel_context_switch(struct task_struct *next, struct pt_regs *regs)
+{
+	struct task_struct *prev = current;
+
+	if (rpal_test_task_thread_flag(next, RPAL_LAZY_SWITCHED_BIT)) {
+		current->rpal_sd->receiver = next;
+		rpal_lock_cpu(current);
+		rpal_lock_cpu(next);
+		rpal_try_to_wake_up(next);
+		update_dst_stack(next, regs);
+		/*
+		 * When a lazy switch occurs, we need to reset the sender's
+		 * user-mode context to the state the sender saved beforehand.
+		 * Otherwise, the sender's user context would be corrupted.
+		 */
+		rebuild_sender_stack(current->rpal_sd, regs);
+		rpal_schedule(next);
+	} else {
+		update_dst_stack(next, regs);
+		/*
+		 * When a lazy switch occurs, we need to reset the receiver's
+		 * user-mode context to the state the receiver saved beforehand.
+		 * Otherwise, the receiver's user context would be corrupted.
+		 */
+		rebuild_receiver_stack(current->rpal_rd, regs);
+		rpal_schedule(next);
+		rpal_clear_task_thread_flag(prev, RPAL_LAZY_SWITCHED_BIT);
+		prev->rpal_rd->sender = NULL;
+	}
+	if (unlikely(!irqs_disabled())) {
+		local_irq_disable();
+		rpal_err("%s: irq is enabled\n", __func__);
+	}
+	return next;
+}
+
 static inline struct task_struct *rpal_get_sender_task(void)
 {
 	struct task_struct *next;
@@ -123,6 +219,18 @@ static inline struct task_struct *rpal_misidentify(void)
 	return next;
 }
 
+static inline struct task_struct *
+rpal_kernel_context_switch(struct pt_regs *regs)
+{
+	struct task_struct *next = NULL;
+
+	next = rpal_misidentify();
+	if (unlikely(next != NULL))
+		next = rpal_do_kernel_context_switch(next, regs);
+
+	return next;
+}
+
 struct task_struct *rpal_find_next_task(unsigned long fsbase)
 {
 	struct rpal_service *cur = rpal_current_service();
@@ -147,6 +255,35 @@ struct task_struct *rpal_find_next_task(unsigned long fsbase)
 	return tsk;
 }
 
+__visible struct task_struct *
+rpal_syscall_64_context_switch(struct pt_regs *regs, unsigned long nr)
+{
+	struct task_struct *next;
+
+	next = rpal_kernel_context_switch(regs);
+
+	return next;
+}
+
+__visible struct task_struct *
+rpal_exception_context_switch(struct pt_regs *regs)
+{
+	struct task_struct *next;
+
+	next = rpal_kernel_context_switch(regs);
+
+	return next;
+}
+
+__visible struct task_struct *rpal_nmi_context_switch(struct pt_regs *regs)
+{
+	struct task_struct *next;
+
+	next = rpal_kernel_context_switch(regs);
+
+	return next;
+}
+
 static bool check_hardware_features(void)
 {
 	if (!boot_cpu_has(X86_FEATURE_FSGSBASE)) {
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index 01b582fa821e..b24176f3f245 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -479,7 +479,13 @@ struct rpal_service *rpal_get_mapped_service_by_id(struct rpal_service *rs,
 int rpal_rebuild_sender_context_on_fault(struct pt_regs *regs,
 					 unsigned long addr, int error_code);
 struct mm_struct *rpal_pf_get_real_mm(unsigned long address, int *rebuild);
+__visible struct task_struct *
+rpal_syscall_64_context_switch(struct pt_regs *regs, unsigned long nr);
+__visible struct task_struct *
+rpal_exception_context_switch(struct pt_regs *regs);
+__visible struct task_struct *rpal_nmi_context_switch(struct pt_regs *regs);
 struct task_struct *rpal_find_next_task(unsigned long fsbase);
+void rpal_lazy_switch_tail(struct task_struct *tsk);
 void rpal_kernel_ret(struct pt_regs *regs);
 
 extern void rpal_pick_mmap_base(struct mm_struct *mm,
-- 
2.20.1

