Message-Id: <4cd58d0e989640f0c230196e81cec5cee0ceb476.1748594841.git.libo.gcs85@bytedance.com>
Date: Fri, 30 May 2025 17:27:48 +0800
From: Bo Li <libo.gcs85@...edance.com>
To: tglx@...utronix.de,
mingo@...hat.com,
bp@...en8.de,
dave.hansen@...ux.intel.com,
x86@...nel.org,
luto@...nel.org,
kees@...nel.org,
akpm@...ux-foundation.org,
david@...hat.com,
juri.lelli@...hat.com,
vincent.guittot@...aro.org,
peterz@...radead.org
Cc: dietmar.eggemann@....com,
hpa@...or.com,
acme@...nel.org,
namhyung@...nel.org,
mark.rutland@....com,
alexander.shishkin@...ux.intel.com,
jolsa@...nel.org,
irogers@...gle.com,
adrian.hunter@...el.com,
kan.liang@...ux.intel.com,
viro@...iv.linux.org.uk,
brauner@...nel.org,
jack@...e.cz,
lorenzo.stoakes@...cle.com,
Liam.Howlett@...cle.com,
vbabka@...e.cz,
rppt@...nel.org,
surenb@...gle.com,
mhocko@...e.com,
rostedt@...dmis.org,
bsegall@...gle.com,
mgorman@...e.de,
vschneid@...hat.com,
jannh@...gle.com,
pfalcato@...e.de,
riel@...riel.com,
harry.yoo@...cle.com,
linux-kernel@...r.kernel.org,
linux-perf-users@...r.kernel.org,
linux-fsdevel@...r.kernel.org,
linux-mm@...ck.org,
duanxiongchun@...edance.com,
yinhongbo@...edance.com,
dengliang.1214@...edance.com,
xieyongji@...edance.com,
chaiwen.cc@...edance.com,
songmuchun@...edance.com,
yuanzhu@...edance.com,
chengguozhu@...edance.com,
sunjiadong.lff@...edance.com,
Bo Li <libo.gcs85@...edance.com>
Subject: [RFC v2 20/35] RPAL: add rpal_ret_from_lazy_switch

After a lazy switch, the task that ran before the switch loses its user
mode context (the context is handed over to the task that runs after the
switch). RPAL therefore has to deal with the previous task resuming
without a user mode context of its own.

After the lazy switch, the sender can resume execution in two ways. The
first is to be picked up again by the scheduler. RPAL handles this case in
a manner similar to ret_from_fork: through the stack frame constructed by
the lazy switch, the sender enters rpal_ret_from_lazy_switch, executes the
return logic there, and finally returns to a pre-defined user mode context
(referred to as "kernel return"). The second is to be switched back to by
the receiver through another lazy switch. In that case the receiver passes
its user mode context to the sender, so no user mode context needs to be
constructed for the sender, and the receiver in turn returns to user mode
through the kernel return path.
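
For reference, the kernel-return path mirrors the shape of the upstream
ret_from_fork_asm in arch/x86/entry/entry_64.S. An abridged copy is shown
below purely for comparison; it is not part of this patch, and details
(FRED handling, call-depth accounting, unwind hints) vary between kernel
versions:

  SYM_CODE_START(ret_from_fork_asm)
          UNWIND_HINT_END_OF_STACK
          ANNOTATE_NOENDBR
          movq %rax, %rdi                 /* prev */
          movq %rsp, %rsi                 /* regs */
          movq %rbx, %rdx                 /* fn */
          movq %r12, %rcx                 /* fn_arg */
          call ret_from_fork

          UNWIND_HINT_REGS
          jmp swapgs_restore_regs_and_return_to_usermode
  SYM_CODE_END(ret_from_fork_asm)

rpal_ret_from_lazy_switch follows the same pattern, but calls
rpal_schedule_tail() and rpal_kernel_ret() instead of ret_from_fork(), and
always takes the return-to-user path.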

rpal_ret_from_lazy_switch mainly performs scheduler cleanup, similar to
schedule_tail(), but skips the set_child_tid handling; otherwise the child
TID could be written repeatedly. It then calls rpal_kernel_ret(), which
sets the states of the sender and the receiver and attempts to unlock the
CPU. Finally, it performs the syscall exit work and returns to user mode.
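
For comparison, an abridged schedule_tail() from kernel/sched/core.c looks
like this (quoted only to illustrate the difference; not part of this
patch, and newer kernels add a few more lines such as tracepoints):

  asmlinkage __visible void schedule_tail(struct task_struct *prev)
          __releases(rq->lock)
  {
          finish_task_switch(prev);
          preempt_enable();

          if (current->set_child_tid)
                  put_user(task_pid_vnr(current), current->set_child_tid);

          calculate_sigpending();
  }

The put_user() of the child TID is exactly the step rpal_ret_from_lazy_switch
has to skip: it already ran when the task first returned from fork, and
running it again on every kernel return would write the TID repeatedly.
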
Signed-off-by: Bo Li <libo.gcs85@...edance.com>
---
arch/x86/entry/entry_64.S | 23 ++++++++++++++++++++
arch/x86/rpal/core.c | 45 +++++++++++++++++++++++++++++++++++++--
include/linux/rpal.h | 5 ++++-
kernel/sched/core.c | 25 +++++++++++++++++++++-
4 files changed, 94 insertions(+), 4 deletions(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index ed04a968cc7d..13b4d0684575 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -169,6 +169,29 @@ SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
int3
SYM_CODE_END(entry_SYSCALL_64)
+#ifdef CONFIG_RPAL
+SYM_CODE_START(rpal_ret_from_lazy_switch)
+        UNWIND_HINT_END_OF_STACK
+        ANNOTATE_NOENDBR
+        movq %rax, %rdi
+        call rpal_schedule_tail
+
+        movq %rsp, %rdi
+        call rpal_kernel_ret
+
+        movq %rsp, %rdi
+        call syscall_exit_to_user_mode /* returns with IRQs disabled */
+
+        UNWIND_HINT_REGS
+#ifdef CONFIG_X86_FRED
+        ALTERNATIVE "jmp swapgs_restore_regs_and_return_to_usermode", \
+                    "jmp asm_fred_exit_user", X86_FEATURE_FRED
+#else
+        jmp swapgs_restore_regs_and_return_to_usermode
+#endif
+SYM_CODE_END(rpal_ret_from_lazy_switch)
+#endif
+
/*
* %rdi: prev task
* %rsi: next task
diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index 19c4ef38bca3..ed4c11e6838c 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -18,7 +18,7 @@ unsigned long rpal_cap;
static inline void rpal_lock_cpu(struct task_struct *tsk)
{
-        rpal_set_cpus_allowed_ptr(tsk, true);
+        rpal_set_cpus_allowed_ptr(tsk, true, false);
        if (unlikely(!irqs_disabled())) {
                local_irq_disable();
                rpal_err("%s: irq is enabled\n", __func__);
@@ -27,13 +27,54 @@ static inline void rpal_lock_cpu(struct task_struct *tsk)
static inline void rpal_unlock_cpu(struct task_struct *tsk)
{
-        rpal_set_cpus_allowed_ptr(tsk, false);
+        rpal_set_cpus_allowed_ptr(tsk, false, false);
        if (unlikely(!irqs_disabled())) {
                local_irq_disable();
                rpal_err("%s: irq is enabled\n", __func__);
        }
}
+static inline void rpal_unlock_cpu_kernel_ret(struct task_struct *tsk)
+{
+        rpal_set_cpus_allowed_ptr(tsk, false, true);
+}
+
+void rpal_kernel_ret(struct pt_regs *regs)
+{
+        struct task_struct *tsk;
+        struct rpal_receiver_call_context *rcc;
+        int state;
+
+        if (rpal_test_current_thread_flag(RPAL_RECEIVER_BIT)) {
+                rcc = current->rpal_rd->rcc;
+                atomic_xchg(&rcc->receiver_state, RPAL_RECEIVER_STATE_KERNEL_RET);
+        } else {
+                tsk = current->rpal_sd->receiver;
+                rcc = tsk->rpal_rd->rcc;
+                rpal_clear_task_thread_flag(tsk, RPAL_LAZY_SWITCHED_BIT);
+                state = atomic_xchg(&rcc->sender_state, RPAL_SENDER_STATE_KERNEL_RET);
+                WARN_ON_ONCE(state != RPAL_SENDER_STATE_CALL);
+                /* make sure kernel return is finished */
+                smp_mb();
+                WRITE_ONCE(tsk->rpal_rd->sender, NULL);
+                /*
+                 * We must unlock receiver first, otherwise we may unlock
+                 * receiver which is already locked by another sender.
+                 *
+                 * Sender A            Receiver B          Sender C
+                 * lazy switch (A->B)
+                 * kernel return
+                 * unlock cpu A
+                 *                     epoll_wait
+                 *                                         lazy switch(C->B)
+                 *                                         lock cpu B
+                 * unlock cpu B
+                 *                     BUG()               BUG()
+                 */
+                rpal_unlock_cpu_kernel_ret(tsk);
+                rpal_unlock_cpu_kernel_ret(current);
+        }
+}
static inline struct task_struct *rpal_get_sender_task(void)
{
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index 0813db4552c0..01b582fa821e 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -480,14 +480,17 @@ int rpal_rebuild_sender_context_on_fault(struct pt_regs *regs,
unsigned long addr, int error_code);
struct mm_struct *rpal_pf_get_real_mm(unsigned long address, int *rebuild);
struct task_struct *rpal_find_next_task(unsigned long fsbase);
+void rpal_kernel_ret(struct pt_regs *regs);
extern void rpal_pick_mmap_base(struct mm_struct *mm,
struct rlimit *rlim_stack);
int rpal_try_to_wake_up(struct task_struct *p);
int rpal_init_thread_pending(struct rpal_common_data *rcd);
void rpal_free_thread_pending(struct rpal_common_data *rcd);
-int rpal_set_cpus_allowed_ptr(struct task_struct *p, bool is_lock);
+int rpal_set_cpus_allowed_ptr(struct task_struct *p, bool is_lock,
+                              bool is_kernel_ret);
void rpal_schedule(struct task_struct *next);
asmlinkage struct task_struct *
__rpal_switch_to(struct task_struct *prev_p, struct task_struct *next_p);
+asmlinkage __visible void rpal_schedule_tail(struct task_struct *prev);
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 760d88458b39..0f9343698198 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3181,7 +3181,8 @@ void rpal_free_thread_pending(struct rpal_common_data *rcd)
/*
* CPU lock is forced and all cpumask will be ignored by RPAL temporary.
*/
-int rpal_set_cpus_allowed_ptr(struct task_struct *p, bool is_lock)
+int rpal_set_cpus_allowed_ptr(struct task_struct *p, bool is_lock,
+                              bool is_kernel_ret)
{
        const struct cpumask *cpu_valid_mask = cpu_active_mask;
        struct set_affinity_pending *pending = p->rpal_cd->pending;
@@ -3210,6 +3211,9 @@ int rpal_set_cpus_allowed_ptr(struct task_struct *p, bool is_lock)
                rpal_clear_task_thread_flag(p, RPAL_CPU_LOCKED_BIT);
        }
+        if (is_kernel_ret)
+                return __set_cpus_allowed_ptr_locked(p, &ac, rq, &rf);
+
        update_rq_clock(rq);
        if (cpumask_equal(&p->cpus_mask, ac.new_mask))
@@ -11011,6 +11015,25 @@ void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
#endif /* CONFIG_SCHED_CLASS_EXT */
#ifdef CONFIG_RPAL
+asmlinkage __visible void rpal_schedule_tail(struct task_struct *prev)
+        __releases(rq->lock)
+{
+        /*
+         * New tasks start with FORK_PREEMPT_COUNT, see there and
+         * finish_task_switch() for details.
+         *
+         * finish_task_switch() will drop rq->lock() and lower preempt_count
+         * and the preempt_enable() will end up enabling preemption (on
+         * PREEMPT_COUNT kernels).
+         */
+
+        finish_task_switch(prev);
+        trace_sched_exit_tp(true, CALLER_ADDR0);
+        preempt_enable();
+
+        calculate_sigpending();
+}
+
static struct rq *rpal_finish_task_switch(struct task_struct *prev)
__releases(rq->lock)
{
--
2.20.1