Message-Id: <924aa7959502c4c3271cb311632eb505e894e26e.1748594841.git.libo.gcs85@bytedance.com>
Date: Fri, 30 May 2025 17:27:49 +0800
From: Bo Li <libo.gcs85@...edance.com>
To: tglx@...utronix.de,
mingo@...hat.com,
bp@...en8.de,
dave.hansen@...ux.intel.com,
x86@...nel.org,
luto@...nel.org,
kees@...nel.org,
akpm@...ux-foundation.org,
david@...hat.com,
juri.lelli@...hat.com,
vincent.guittot@...aro.org,
peterz@...radead.org
Cc: dietmar.eggemann@....com,
hpa@...or.com,
acme@...nel.org,
namhyung@...nel.org,
mark.rutland@....com,
alexander.shishkin@...ux.intel.com,
jolsa@...nel.org,
irogers@...gle.com,
adrian.hunter@...el.com,
kan.liang@...ux.intel.com,
viro@...iv.linux.org.uk,
brauner@...nel.org,
jack@...e.cz,
lorenzo.stoakes@...cle.com,
Liam.Howlett@...cle.com,
vbabka@...e.cz,
rppt@...nel.org,
surenb@...gle.com,
mhocko@...e.com,
rostedt@...dmis.org,
bsegall@...gle.com,
mgorman@...e.de,
vschneid@...hat.com,
jannh@...gle.com,
pfalcato@...e.de,
riel@...riel.com,
harry.yoo@...cle.com,
linux-kernel@...r.kernel.org,
linux-perf-users@...r.kernel.org,
linux-fsdevel@...r.kernel.org,
linux-mm@...ck.org,
duanxiongchun@...edance.com,
yinhongbo@...edance.com,
dengliang.1214@...edance.com,
xieyongji@...edance.com,
chaiwen.cc@...edance.com,
songmuchun@...edance.com,
yuanzhu@...edance.com,
chengguozhu@...edance.com,
sunjiadong.lff@...edance.com,
Bo Li <libo.gcs85@...edance.com>
Subject: [RFC v2 21/35] RPAL: add kernel entry handling for lazy switch

RPAL performs the lazy switch at kernel entry, so every kernel entry
point has to be hooked to run the lazy switch logic. Besides the
operations the lazy switch itself requires (such as making sure the
general-purpose registers are unchanged across the switch), the task
running before the lazy switch loses its user mode context, which is
handed over to the task running after the switch. The kernel entry
code therefore also has to deal with the previous task losing its
user mode context.

This patch hooks all the places where the transition from user mode
to kernel mode happens: entry_SYSCALL_64, error_entry, and
asm_exc_nmi. When the kernel detects a mismatch between the kernel
mode and user mode contexts, it runs the lazy switch logic.

Take the switch from the sender to the receiver as an example: the
receiver thread is first locked to the CPU the sender is running on,
and the receiver thread, which is in the CALL state, is then woken up
via rpal_try_to_wake_up(). The sender's general-purpose register
state (pt_regs) is copied to the receiver, and rpal_schedule() is
called to complete the lazy switch. To handle the sender losing its
context, the kernel loads the sender's pre-saved user mode context
into the sender's pt_regs and builds the sender's kernel stack frame
in a way similar to fork.

The switch from the receiver back to the sender is handled in the
same way, except that the receiver is unlocked from the current CPU
and the receiver can only return to user mode through the normal
kernel return path.
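
Condensed into C, the two directions of the lazy switch roughly look
like the sketch below. This is illustrative only: a simplified view of
rpal_do_kernel_context_switch() added by this patch, with the kernel
stack switch, the frame rebuild done in assembly, and the post-switch
cleanup (rpal_lazy_switch_tail) left out; lazy_switch_sketch is a
hypothetical name used only for this example.

static struct task_struct *
lazy_switch_sketch(struct task_struct *next, struct pt_regs *regs)
{
	if (rpal_test_task_thread_flag(next, RPAL_LAZY_SWITCHED_BIT)) {
		/* sender -> receiver */
		rpal_lock_cpu(current);		/* pin both tasks ... */
		rpal_lock_cpu(next);		/* ... to this CPU */
		rpal_try_to_wake_up(next);	/* receiver is in the CALL state */
		update_dst_stack(next, regs);	/* hand pt_regs to the receiver */
		/* reload the sender's pre-saved user mode context */
		rebuild_sender_stack(current->rpal_sd, regs);
	} else {
		/* receiver -> sender */
		update_dst_stack(next, regs);	/* hand pt_regs back to the sender */
		/* reload the receiver's pre-saved user mode context */
		rebuild_receiver_stack(current->rpal_rd, regs);
	}
	rpal_schedule(next);	/* complete the lazy switch */
	return next;
}
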
Signed-off-by: Bo Li <libo.gcs85@...edance.com>
---
arch/x86/entry/entry_64.S | 137 ++++++++++++++++++++++++++++++++++
arch/x86/kernel/asm-offsets.c | 3 +
arch/x86/rpal/core.c | 137 ++++++++++++++++++++++++++++++++++
include/linux/rpal.h | 6 ++
4 files changed, 283 insertions(+)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 13b4d0684575..59c38627510d 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -118,6 +118,20 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
UNTRAIN_RET
CLEAR_BRANCH_HISTORY
+#ifdef CONFIG_RPAL
+ /*
+ * We first check whether this is an RPAL sender/receiver via
+ * current->rpal_cd. Non-RPAL tasks simply skip this check.
+ * For RPAL tasks, we may need to check whether a lazy
+ * switch is needed.
+ */
+ movq PER_CPU_VAR(current_task), %r13
+ movq TASK_rpal_cd(%r13), %rax
+ testq %rax, %rax
+ jz _do_syscall
+ jmp do_rpal_syscall
+_do_syscall:
+#endif
call do_syscall_64 /* returns with IRQs disabled */
/*
@@ -190,6 +204,101 @@ SYM_CODE_START(rpal_ret_from_lazy_switch)
jmp swapgs_restore_regs_and_return_to_usermode
#endif
SYM_CODE_END(rpal_ret_from_lazy_switch)
+
+/* return address offset of stack frame */
+#define RPAL_FRAME_RET_ADDR_OFFSET -56
+
+SYM_CODE_START(do_rpal_syscall)
+ movq %rsp, %r14
+ call rpal_syscall_64_context_switch
+ testq %rax, %rax
+ jz 1f
+
+ /*
+ * When we get here, everything except the stack switch is done,
+ * so the current task is still using the previous task's kernel
+ * stack. Do the stack switch here.
+ *
+ * Meanwhile, the previous task's stack content is no longer valid,
+ * so we also rebuild its stack frame so that it jumps to
+ * rpal_ret_from_lazy_switch when it is scheduled back in. This is
+ * inspired by ret_from_fork.
+ */
+ movq TASK_threadsp(%rax), %rsp
+#ifdef CONFIG_STACKPROTECTOR
+ movq TASK_stack_canary(%rax), %rbx
+ movq %rbx, PER_CPU_VAR(__stack_chk_guard)
+#endif
+ /* rebuild src's frame */
+ movq $rpal_ret_from_lazy_switch, -8(%r14)
+ leaq RPAL_FRAME_RET_ADDR_OFFSET(%r14), %rbx
+ movq %rbx, TASK_threadsp(%r13)
+
+ movq %r13, %rdi
+ /*
+ * The task switch itself is done, but we still need to do a
+ * few extra things for the lazy switch.
+ */
+ call rpal_lazy_switch_tail
+
+1:
+ movq ORIG_RAX(%rsp), %rsi
+ movq %rsp, %rdi
+ jmp _do_syscall
+SYM_CODE_END(do_rpal_syscall)
+
+SYM_CODE_START(do_rpal_error)
+ popq %r12
+ movq %rax, %rsp
+ movq %rax, %r14
+ movq %rax, %rdi
+ call rpal_exception_context_switch
+ testq %rax, %rax
+ jz 1f
+
+ movq TASK_threadsp(%rax), %rsp
+ ENCODE_FRAME_POINTER
+#ifdef CONFIG_STACKPROTECTOR
+ movq TASK_stack_canary(%rax), %rbx
+ movq %rbx, PER_CPU_VAR(__stack_chk_guard)
+#endif
+ /* rebuild src's frame */
+ movq $rpal_ret_from_lazy_switch, -8(%r14)
+ leaq RPAL_FRAME_RET_ADDR_OFFSET(%r14), %rbx
+ movq %rbx, TASK_threadsp(%r13)
+
+ movq %r13, %rdi
+ call rpal_lazy_switch_tail
+1:
+ movq %rsp, %rax
+ pushq %r12
+ jmp _do_error
+SYM_CODE_END(do_rpal_error)
+
+SYM_CODE_START(do_rpal_nmi)
+ movq %rsp, %r14
+ movq %rsp, %rdi
+ call rpal_nmi_context_switch
+ testq %rax, %rax
+ jz 1f
+
+ movq TASK_threadsp(%rax), %rsp
+ ENCODE_FRAME_POINTER
+#ifdef CONFIG_STACKPROTECTOR
+ movq TASK_stack_canary(%rax), %rbx
+ movq %rbx, PER_CPU_VAR(__stack_chk_guard)
+#endif
+ /* rebuild src's frame */
+ movq $rpal_ret_from_lazy_switch, -8(%r14)
+ leaq RPAL_FRAME_RET_ADDR_OFFSET(%r14), %rbx
+ movq %rbx, TASK_threadsp(%r13)
+
+ movq %r13, %rdi
+ call rpal_lazy_switch_tail
+
+1:
+ jmp _do_nmi
+SYM_CODE_END(do_rpal_nmi)
#endif
/*
@@ -1047,7 +1156,22 @@ SYM_CODE_START(error_entry)
leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */
/* Put us onto the real thread stack. */
+#ifdef CONFIG_RPAL
+ call sync_regs
+ /*
+ * Check whether we need to perform a lazy switch after we have
+ * switched to the real thread stack.
+ */
+ movq PER_CPU_VAR(current_task), %r13
+ movq TASK_rpal_cd(%r13), %rdi
+ testq %rdi, %rdi
+ jz _do_error
+ jmp do_rpal_error
+_do_error:
+ RET
+#else
jmp sync_regs
+#endif
/*
* There are two places in the kernel that can potentially fault with
@@ -1206,6 +1330,19 @@ SYM_CODE_START(asm_exc_nmi)
IBRS_ENTER
UNTRAIN_RET
+#ifdef CONFIG_RPAL
+ /*
+ * Check whether we need to perform a lazy switch. This check is
+ * only needed when we come from userspace.
+ */
+ movq PER_CPU_VAR(current_task), %r13
+ movq TASK_rpal_cd(%r13), %rax
+ testq %rax, %rax
+ jz _do_nmi
+ jmp do_rpal_nmi
+_do_nmi:
+#endif
+
/*
* At this point we no longer need to worry about stack damage
* due to nesting -- we're on the normal thread stack and we're
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 6259b474073b..010202c31b37 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -46,6 +46,9 @@ static void __used common(void)
#ifdef CONFIG_STACKPROTECTOR
OFFSET(TASK_stack_canary, task_struct, stack_canary);
#endif
+#ifdef CONFIG_RPAL
+ OFFSET(TASK_rpal_cd, task_struct, rpal_cd);
+#endif
BLANK();
OFFSET(pbe_address, pbe, address);
diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index ed4c11e6838c..c48df1ce4324 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -7,6 +7,7 @@
*/
#include <linux/rpal.h>
+#include <linux/sched/task_stack.h>
#include <asm/fsgsbase.h>
#include "internal.h"
@@ -39,6 +40,20 @@ static inline void rpal_unlock_cpu_kernel_ret(struct task_struct *tsk)
rpal_set_cpus_allowed_ptr(tsk, false, true);
}
+void rpal_lazy_switch_tail(struct task_struct *tsk)
+{
+ struct rpal_receiver_call_context *rcc;
+
+ if (rpal_test_task_thread_flag(current, RPAL_LAZY_SWITCHED_BIT)) {
+ rcc = current->rpal_rd->rcc;
+ atomic_cmpxchg(&rcc->receiver_state, rpal_build_call_state(tsk->rpal_sd),
+ RPAL_RECEIVER_STATE_LAZY_SWITCH);
+ } else {
+ rpal_unlock_cpu(tsk);
+ rpal_unlock_cpu(current);
+ }
+}
+
void rpal_kernel_ret(struct pt_regs *regs)
{
struct task_struct *tsk;
@@ -76,6 +91,87 @@ void rpal_kernel_ret(struct pt_regs *regs)
}
}
+static inline void rebuild_stack(struct rpal_task_context *ctx,
+ struct pt_regs *regs)
+{
+ regs->r12 = ctx->r12;
+ regs->r13 = ctx->r13;
+ regs->r14 = ctx->r14;
+ regs->r15 = ctx->r15;
+ regs->bx = ctx->rbx;
+ regs->bp = ctx->rbp;
+ regs->ip = ctx->rip;
+ regs->sp = ctx->rsp;
+}
+
+static inline void rebuild_sender_stack(struct rpal_sender_data *rsd,
+ struct pt_regs *regs)
+{
+ rebuild_stack(&rsd->scc->rtc, regs);
+}
+
+static inline void rebuild_receiver_stack(struct rpal_receiver_data *rrd,
+ struct pt_regs *regs)
+{
+ rebuild_stack(&rrd->rcc->rtc, regs);
+}
+
+static inline void update_dst_stack(struct task_struct *next,
+ struct pt_regs *src)
+{
+ struct pt_regs *dst;
+
+ dst = task_pt_regs(next);
+ *dst = *src;
+ next->thread.sp = (unsigned long)dst;
+}
+
+/*
+ * rpal_do_kernel_context_switch - the main routine of RPAL lazy switch
+ * @next: task to switch to
+ * @regs: the user pt_regs saved in kernel entry
+ *
+ * This function performs the lazy switch. When switching from sender
+ * to receiver, we need to lock both tasks to the current CPU to avoid
+ * a double control flow during and after the lazy switch.
+ */
+static struct task_struct *
+rpal_do_kernel_context_switch(struct task_struct *next, struct pt_regs *regs)
+{
+ struct task_struct *prev = current;
+
+ if (rpal_test_task_thread_flag(next, RPAL_LAZY_SWITCHED_BIT)) {
+ current->rpal_sd->receiver = next;
+ rpal_lock_cpu(current);
+ rpal_lock_cpu(next);
+ rpal_try_to_wake_up(next);
+ update_dst_stack(next, regs);
+ /*
+ * When a lazy switch occurs, the sender's user-mode context
+ * must be set to the state pre-saved by the sender.
+ * Otherwise, the sender's user context would be corrupted.
+ */
+ rebuild_sender_stack(current->rpal_sd, regs);
+ rpal_schedule(next);
+ } else {
+ update_dst_stack(next, regs);
+ /*
+ * When a lazy switch occurs, the receiver's user-mode context
+ * must be set to the state pre-saved by the receiver.
+ * Otherwise, the receiver's user context would be corrupted.
+ */
+ rebuild_receiver_stack(current->rpal_rd, regs);
+ rpal_schedule(next);
+ rpal_clear_task_thread_flag(prev, RPAL_LAZY_SWITCHED_BIT);
+ prev->rpal_rd->sender = NULL;
+ }
+ if (unlikely(!irqs_disabled())) {
+ local_irq_disable();
+ rpal_err("%s: irq is enabled\n", __func__);
+ }
+ return next;
+}
+
static inline struct task_struct *rpal_get_sender_task(void)
{
struct task_struct *next;
@@ -123,6 +219,18 @@ static inline struct task_struct *rpal_misidentify(void)
return next;
}
+static inline struct task_struct *
+rpal_kernel_context_switch(struct pt_regs *regs)
+{
+ struct task_struct *next = NULL;
+
+ next = rpal_misidentify();
+ if (unlikely(next != NULL))
+ next = rpal_do_kernel_context_switch(next, regs);
+
+ return next;
+}
+
struct task_struct *rpal_find_next_task(unsigned long fsbase)
{
struct rpal_service *cur = rpal_current_service();
@@ -147,6 +255,35 @@ struct task_struct *rpal_find_next_task(unsigned long fsbase)
return tsk;
}
+__visible struct task_struct *
+rpal_syscall_64_context_switch(struct pt_regs *regs, unsigned long nr)
+{
+ struct task_struct *next;
+
+ next = rpal_kernel_context_switch(regs);
+
+ return next;
+}
+
+__visible struct task_struct *
+rpal_exception_context_switch(struct pt_regs *regs)
+{
+ struct task_struct *next;
+
+ next = rpal_kernel_context_switch(regs);
+
+ return next;
+}
+
+__visible struct task_struct *rpal_nmi_context_switch(struct pt_regs *regs)
+{
+ struct task_struct *next;
+
+ next = rpal_kernel_context_switch(regs);
+
+ return next;
+}
+
static bool check_hardware_features(void)
{
if (!boot_cpu_has(X86_FEATURE_FSGSBASE)) {
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index 01b582fa821e..b24176f3f245 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -479,7 +479,13 @@ struct rpal_service *rpal_get_mapped_service_by_id(struct rpal_service *rs,
int rpal_rebuild_sender_context_on_fault(struct pt_regs *regs,
unsigned long addr, int error_code);
struct mm_struct *rpal_pf_get_real_mm(unsigned long address, int *rebuild);
+__visible struct task_struct *
+rpal_syscall_64_context_switch(struct pt_regs *regs, unsigned long nr);
+__visible struct task_struct *
+rpal_exception_context_switch(struct pt_regs *regs);
+__visible struct task_struct *rpal_nmi_context_switch(struct pt_regs *regs);
struct task_struct *rpal_find_next_task(unsigned long fsbase);
+void rpal_lazy_switch_tail(struct task_struct *tsk);
void rpal_kernel_ret(struct pt_regs *regs);
extern void rpal_pick_mmap_base(struct mm_struct *mm,
--
2.20.1