[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250129132148.301937-3-npiggin@gmail.com>
Date: Wed, 29 Jan 2025 23:21:43 +1000
From: Nicholas Piggin <npiggin@...il.com>
To: "Dmitry V. Levin" <ldv@...ace.io>
Cc: Nicholas Piggin <npiggin@...il.com>,
Oleg Nesterov <oleg@...hat.com>,
Eugene Syromyatnikov <evgsyr@...il.com>,
Mike Frysinger <vapier@...too.org>,
Renzo Davoli <renzo@...unibo.it>,
Davide Berardi <berardi.dav@...il.com>,
strace-devel@...ts.strace.io,
Madhavan Srinivasan <maddy@...ux.ibm.com>,
Christophe Leroy <christophe.leroy@...roup.eu>,
linuxppc-dev@...ts.ozlabs.org,
linux-kernel@...r.kernel.org
Subject: [RFC PATCH 2/2] powerpc/syscall: rework syscall return value handling
powerpc is one of the "interesting" archs when it comes to system call
error return convention. Rather than returning a -errno, it sets a
condition register bit and returns a +errno. To make matters more
complicated, a new system call instruction was introduced that uses the
-errno convention.
These error conventions are carried throughout the kernel in pt_regs,
which has lead to complexity in accessor functions and several subtle
breakages and surprising behaviour particularly around trace and bpf
code.
Can we change the way we do this and improve commonality with the big
archs by always using the "Linux convention" of -errno as the return
value, then just swapping that to powerpc convention in low level
powerpc syscall return code?
In theory this limits some "successful" values that can be returned by
system calls, however its the same as x86 and others, and recent 64-bit
powerpc that use scv 0 for the past few years have the same limitation
too.
Signed-off-by: Nicholas Piggin <npiggin@...il.com>
---
arch/powerpc/include/asm/ptrace.h | 13 +----
arch/powerpc/include/asm/syscall.h | 31 +---------
arch/powerpc/kernel/interrupt.c | 16 +++---
arch/powerpc/kernel/signal.c | 56 ++++++++++---------
tools/testing/selftests/seccomp/seccomp_bpf.c | 16 ++++++
5 files changed, 60 insertions(+), 72 deletions(-)
diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h
index 7b9350756875..b09b1524c74d 100644
--- a/arch/powerpc/include/asm/ptrace.h
+++ b/arch/powerpc/include/asm/ptrace.h
@@ -286,21 +286,12 @@ static __always_inline void set_trap_norestart(struct pt_regs *regs)
#define kernel_stack_pointer(regs) ((regs)->gpr[1])
static inline int is_syscall_success(struct pt_regs *regs)
{
- if (trap_is_scv(regs))
- return !IS_ERR_VALUE((unsigned long)regs->gpr[3]);
- else
- return !(regs->ccr & 0x10000000);
+ return !IS_ERR_VALUE((unsigned long)regs->gpr[3]);
}
static inline long regs_return_value(struct pt_regs *regs)
{
- if (trap_is_scv(regs))
- return regs->gpr[3];
-
- if (is_syscall_success(regs))
- return regs->gpr[3];
- else
- return -regs->gpr[3];
+ return regs->gpr[3];
}
static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h
index 3dd36c5e334a..020e9f5ed1ca 100644
--- a/arch/powerpc/include/asm/syscall.h
+++ b/arch/powerpc/include/asm/syscall.h
@@ -48,17 +48,8 @@ static inline void syscall_rollback(struct task_struct *task,
static inline long syscall_get_error(struct task_struct *task,
struct pt_regs *regs)
{
- if (trap_is_scv(regs)) {
- unsigned long error = regs->gpr[3];
-
- return IS_ERR_VALUE(error) ? error : 0;
- } else {
- /*
- * If the system call failed,
- * regs->gpr[3] contains a positive ERRORCODE.
- */
- return (regs->ccr & 0x10000000UL) ? -regs->gpr[3] : 0;
- }
+ unsigned long error = regs->gpr[3];
+ return IS_ERR_VALUE(error) ? error : 0;
}
static inline long syscall_get_return_value(struct task_struct *task,
@@ -71,23 +62,7 @@ static inline void syscall_set_return_value(struct task_struct *task,
struct pt_regs *regs,
int error, long val)
{
- if (trap_is_scv(regs)) {
- regs->gpr[3] = (long) error ?: val;
- } else {
- /*
- * In the general case it's not obvious that we must deal with
- * CCR here, as the syscall exit path will also do that for us.
- * However there are some places, eg. the signal code, which
- * check ccr to decide if the value in r3 is actually an error.
- */
- if (error) {
- regs->ccr |= 0x10000000L;
- regs->gpr[3] = error;
- } else {
- regs->ccr &= ~0x10000000L;
- regs->gpr[3] = val;
- }
- }
+ regs->gpr[3] = (long) error ?: val;
}
static inline void syscall_get_arguments(struct task_struct *task,
diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 8f4acc55407b..18ee03fa50ae 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -277,13 +277,6 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
ti_flags = read_thread_flags();
- if (unlikely(r3 >= (unsigned long)-MAX_ERRNO) && is_not_scv) {
- if (likely(!(ti_flags & (_TIF_NOERROR | _TIF_RESTOREALL)))) {
- r3 = -r3;
- regs->ccr |= 0x10000000; /* Set SO bit in CR */
- }
- }
-
if (unlikely(ti_flags & _TIF_PERSYSCALL_MASK)) {
if (ti_flags & _TIF_RESTOREALL)
ret = _TIF_RESTOREALL;
@@ -302,6 +295,15 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
local_irq_disable();
ret = interrupt_exit_user_prepare_main(ret, regs);
+ /* sc syscalls return error by setting CR0[SO] and +ve errno in r3 */
+ /* do_signal() has similar conversion */
+ if (unlikely(IS_ERR_VALUE(regs->gpr[3]) && is_not_scv)) {
+ if (likely(!(ti_flags & (_TIF_NOERROR | _TIF_RESTOREALL)))) {
+ regs->gpr[3] = -regs->gpr[3];
+ regs->ccr |= 0x10000000; /* Set SO bit in CR */
+ }
+ }
+
#ifdef CONFIG_PPC64
regs->exit_result = ret;
#endif
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index 193211b04805..b578d1e40b40 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -175,7 +175,7 @@ void __user *get_sigframe(struct ksignal *ksig, struct task_struct *tsk,
return (void __user *)newsp;
}
-static void check_syscall_restart(struct pt_regs *regs, struct k_sigaction *ka,
+static bool check_syscall_restart(struct pt_regs *regs, struct k_sigaction *ka,
int has_handler)
{
unsigned long ret = regs->gpr[3];
@@ -183,60 +183,52 @@ static void check_syscall_restart(struct pt_regs *regs, struct k_sigaction *ka,
/* syscall ? */
if (!trap_is_syscall(regs))
- return;
+ return false;
if (trap_norestart(regs))
- return;
+ return false;
/* error signalled ? */
- if (trap_is_scv(regs)) {
- /* 32-bit compat mode sign extend? */
- if (!IS_ERR_VALUE(ret))
- return;
- ret = -ret;
- } else if (!(regs->ccr & 0x10000000)) {
- return;
- }
+ if (!IS_ERR_VALUE(ret)) /* Should we 32-bit compat mode sign extend? */
+ return false;
switch (ret) {
- case ERESTART_RESTARTBLOCK:
- case ERESTARTNOHAND:
+ case -ERESTART_RESTARTBLOCK:
+ case -ERESTARTNOHAND:
/* ERESTARTNOHAND means that the syscall should only be
* restarted if there was no handler for the signal, and since
* we only get here if there is a handler, we dont restart.
*/
restart = !has_handler;
break;
- case ERESTARTSYS:
+ case -ERESTARTSYS:
/* ERESTARTSYS means to restart the syscall if there is no
* handler or the handler was registered with SA_RESTART
*/
restart = !has_handler || (ka->sa.sa_flags & SA_RESTART) != 0;
break;
- case ERESTARTNOINTR:
+ case -ERESTARTNOINTR:
/* ERESTARTNOINTR means that the syscall should be
* called again after the signal handler returns.
*/
break;
default:
- return;
+ return false;
}
if (restart) {
- if (ret == ERESTART_RESTARTBLOCK)
+ if (ret == -ERESTART_RESTARTBLOCK)
regs->gpr[0] = __NR_restart_syscall;
else
regs->gpr[3] = regs->orig_gpr3;
regs_add_return_ip(regs, -4);
regs->result = 0;
+
+ return true;
} else {
- if (trap_is_scv(regs)) {
- regs->result = -EINTR;
- regs->gpr[3] = -EINTR;
- } else {
- regs->result = -EINTR;
- regs->gpr[3] = EINTR;
- regs->ccr |= 0x10000000;
- }
+ regs->result = -EINTR;
+ regs->gpr[3] = -EINTR;
+
+ return false;
}
}
@@ -245,6 +237,7 @@ static void do_signal(struct task_struct *tsk)
struct pt_regs *regs = tsk->thread.regs;
sigset_t *oldset = sigmask_to_save();
struct ksignal ksig = { .sig = 0 };
+ bool restart;
int ret;
BUG_ON(tsk != current);
@@ -252,7 +245,7 @@ static void do_signal(struct task_struct *tsk)
get_signal(&ksig);
/* Is there any syscall restart business here ? */
- check_syscall_restart(regs, &ksig.ka, ksig.sig > 0);
+ restart = check_syscall_restart(regs, &ksig.ka, ksig.sig > 0);
if (ksig.sig <= 0) {
/* No signal to deliver -- put the saved sigmask back */
@@ -280,6 +273,17 @@ static void do_signal(struct task_struct *tsk)
rseq_signal_deliver(&ksig, regs);
+ /* Fix the error code which is restored-exact from sigreturn stack */
+ /* syscall_exit_prepare() has similar conversion */
+ if (trap_is_syscall(regs) && !restart &&
+ (unlikely(IS_ERR_VALUE(regs->gpr[3]) && !trap_is_scv(regs)))) {
+ if (likely(!(read_thread_flags() & (_TIF_NOERROR | _TIF_RESTOREALL)))) {
+ regs->gpr[3] = -regs->gpr[3];
+ regs->result = -regs->result;
+ regs->ccr |= 0x10000000; /* Set SO bit in CR */
+ }
+ }
+
if (is_32bit_task()) {
if (ksig.ka.sa.sa_flags & SA_SIGINFO)
ret = handle_rt_signal32(&ksig, oldset, tsk);
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 8c3a73461475..1bda8c9441d3 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -1786,6 +1786,22 @@ TEST_F(TRACE_poke, getpid_runs_normally)
# define SYSCALL_RET_SET(_regs, _val) \
do { \
typeof(_val) _result = (_val); \
+ /* \
+ * powerpc sc 0 syscall error return convention \
+ * is r3 = +EERROR and CR0[SO]=1 to flag error. \
+ * The scv 0 error convention is r3 = -EERROR. \
+ * \
+ * The type of syscall can be detected by \
+ * testing _regs.trap & 0xfff0, the value \
+ * 0x0c00 is an sc 0 call, and 0x3000 is scv 0. \
+ * \
+ * However tracing does not need to know the \
+ * difference when injecting an error, as Linux \
+ * will convert -EERROR in r3 to the sc 0 error \
+ * convention as necessary. But the below code \
+ * demonstrates explicitly setting the sc 0 \
+ * error convention. \
+ */ \
if ((_regs.trap & 0xfff0) == 0x3000) { \
/* \
* scv 0 system call uses -ve result \
--
2.47.1
Powered by blists - more mailing lists