Message-ID: <a406618d-641e-4e8c-afc5-7c4f0db42b1f@kernel.org>
Date: Tue, 16 Dec 2025 12:01:32 +0100
From: "Christophe Leroy (CS GROUP)" <chleroy@...nel.org>
To: Mukesh Kumar Chaurasiya <mkchauras@...ux.ibm.com>, maddy@...ux.ibm.com,
 mpe@...erman.id.au, npiggin@...il.com, oleg@...hat.com, kees@...nel.org,
 luto@...capital.net, wad@...omium.org, mchauras@...ux.ibm.com,
 thuth@...hat.com, sshegde@...ux.ibm.com, charlie@...osinc.com,
 macro@...am.me.uk, akpm@...ux-foundation.org, ldv@...ace.io, deller@....de,
 ankur.a.arora@...cle.com, segher@...nel.crashing.org, tglx@...utronix.de,
 thomas.weissschuh@...utronix.de, peterz@...radead.org,
 menglong8.dong@...il.com, bigeasy@...utronix.de, namcao@...utronix.de,
 kan.liang@...ux.intel.com, mingo@...nel.org, atrajeev@...ux.vnet.ibm.com,
 mark.barnett@....com, linuxppc-dev@...ts.ozlabs.org,
 linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 8/8] powerpc: Enable Generic Entry/Exit for syscalls.



On 14/12/2025 at 14:02, Mukesh Kumar Chaurasiya wrote:
> From: Mukesh Kumar Chaurasiya <mchauras@...ux.ibm.com>
> 
> Convert the PowerPC syscall entry and exit paths to use the generic
> entry/exit framework by selecting GENERIC_ENTRY and integrating with
> the common syscall handling routines.
> 
> This change transitions PowerPC away from its custom syscall entry and
> exit code to use the generic helpers such as:
>   - syscall_enter_from_user_mode()
>   - syscall_exit_to_user_mode()
> 
> As part of this migration:
>   - The architecture now selects GENERIC_ENTRY in Kconfig.
>   - Old tracing, seccomp, and audit handling in ptrace.c is removed in
>     favor of generic entry infrastructure.
>   - interrupt.c and syscall.c are simplified to delegate context
>     management and user exit handling to the generic entry path.
>   - The new pt_regs field `exit_flags` introduced earlier is now used
>     to carry per-syscall exit state flags (e.g. _TIF_RESTOREALL).
> 
> This aligns PowerPC with the common entry code used by other
> architectures and reduces duplicated logic around syscall tracing,
> context tracking, and signal handling.
> 
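For context, the two generic helpers named above bracket all of the arch-independent per-syscall work (ptrace, seccomp, tracepoints, audit, signal delivery, rseq, context tracking). A minimal sketch of how an arch-level C syscall handler typically ties into them is below; arch_syscall_handler() and arch_lookup_syscall() are illustrative names only, not the actual powerpc functions touched by this patch.

	#include <linux/entry-common.h>

	notrace long arch_syscall_handler(struct pt_regs *regs, long nr)
	{
		long ret;

		/* Entry work: context tracking, ptrace, seccomp, tracepoints, audit. */
		nr = syscall_enter_from_user_mode(regs, nr);

		if (nr >= 0 && nr < NR_syscalls)
			ret = arch_lookup_syscall(nr)(regs);	/* illustrative dispatch */
		else
			ret = -ENOSYS;

		regs->gpr[3] = ret;	/* r3 carries the syscall return value */

		/* Exit work: tracing, signal delivery, rseq checks, context tracking. */
		syscall_exit_to_user_mode(regs);

		return ret;
	}
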
> The performance benchmarks from perf bench basic syscall are below:
> 
> perf bench syscall usec/op
> 
> | Test            | With Patch | Without Patch | % Change |
> | --------------- | ---------- | ------------- | -------- |
> | getppid usec/op | 0.207795   | 0.210373      | -1.22%   |
> | getpgid usec/op | 0.206282   | 0.211676      | -2.55%   |
> | fork usec/op    | 833.986    | 814.809       | +2.35%   |
> | execve usec/op  | 360.939    | 365.168       | -1.16%   |
> 
> perf bench syscall ops/sec
> 
> | Test            | With Patch | Without Patch | % Change |
> | --------------- | ---------- | ------------- | -------- |
> | getppid ops/sec | 4,812,433  | 4,753,459     | +1.24%   |
> | getpgid ops/sec | 4,847,744  | 4,724,192     | +2.61%   |
> | fork ops/sec    | 1,199      | 1,227         | -2.28%   |
> | execve ops/sec  | 2,770      | 2,738         | +1.16%   |
> 
> IPI latency benchmark
> 
> | Metric                  | With Patch       | Without Patch    | % Change |
> | ----------------------- | ---------------- | ---------------- | -------- |
> | Dry-run (ns)            | 206,675.81       | 206,719.36       | -0.02%   |
> | Self-IPI avg (ns)       | 1,939,991.00     | 1,976,116.15     | -1.83%   |
> | Self-IPI max (ns)       | 3,533,718.93     | 3,582,650.33     | -1.37%   |
> | Normal IPI avg (ns)     | 111,110,034.23   | 110,513,373.51   | +0.54%   |
> | Normal IPI max (ns)     | 150,393,442.64   | 149,669,477.89   | +0.48%   |
> | Broadcast IPI max (ns)  | 3,978,231,022.96 | 4,359,916,859.46 | -8.73%   |
> | Broadcast lock max (ns) | 4,025,425,714.49 | 4,384,956,730.83 | -8.20%   |
> 
> That's very close to the earlier performance of the arch-specific handling.
> 
> Signed-off-by: Mukesh Kumar Chaurasiya <mchauras@...ux.ibm.com>
> ---
>   arch/powerpc/Kconfig                    |   1 +
>   arch/powerpc/include/asm/entry-common.h |   5 +-
>   arch/powerpc/kernel/interrupt.c         | 139 +++++++----------------
>   arch/powerpc/kernel/ptrace/ptrace.c     | 141 ------------------------
>   arch/powerpc/kernel/signal.c            |  10 +-
>   arch/powerpc/kernel/syscall.c           | 119 +-------------------
>   6 files changed, 49 insertions(+), 366 deletions(-)
> 
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index b0c602c3bbe1..a4330775b254 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -203,6 +203,7 @@ config PPC
>   	select GENERIC_CPU_AUTOPROBE
>   	select GENERIC_CPU_VULNERABILITIES	if PPC_BARRIER_NOSPEC
>   	select GENERIC_EARLY_IOREMAP
> +	select GENERIC_ENTRY
>   	select GENERIC_GETTIMEOFDAY
>   	select GENERIC_IDLE_POLL_SETUP
>   	select GENERIC_IOREMAP
> diff --git a/arch/powerpc/include/asm/entry-common.h b/arch/powerpc/include/asm/entry-common.h
> index e2ae7416dee1..77129174f882 100644
> --- a/arch/powerpc/include/asm/entry-common.h
> +++ b/arch/powerpc/include/asm/entry-common.h
> @@ -3,7 +3,7 @@
>   #ifndef _ASM_PPC_ENTRY_COMMON_H
>   #define _ASM_PPC_ENTRY_COMMON_H
>   
> -#ifdef CONFIG_GENERIC_IRQ_ENTRY
> +#ifdef CONFIG_GENERIC_ENTRY

Powerpc now selects this unconditionally. Why is this #ifdef needed?
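
Concretely, with the unconditional "select GENERIC_ENTRY" in Kconfig the header could simply read as follows (a sketch, assuming nothing outside powerpc includes this header):

	/* arch/powerpc/include/asm/entry-common.h -- sketch without the guard */
	#ifndef _ASM_PPC_ENTRY_COMMON_H
	#define _ASM_PPC_ENTRY_COMMON_H

	#include <asm/cputime.h>
	#include <asm/interrupt.h>

	/* ... arch_interrupt_enter_prepare() and the other helpers as in the patch ... */

	#endif /* _ASM_PPC_ENTRY_COMMON_H */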


>   
>   #include <asm/cputime.h>
>   #include <asm/interrupt.h>
> @@ -217,9 +217,6 @@ static inline void arch_interrupt_enter_prepare(struct pt_regs *regs)
>   
>   	if (user_mode(regs)) {
>   		kuap_lock();
> -		CT_WARN_ON(ct_state() != CT_STATE_USER);
> -		user_exit_irqoff();
> -
>   		account_cpu_user_entry();
>   		account_stolen_time();
>   	} else {
> diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
> index 7f67f0b9d627..7d5cd4b5a610 100644
> --- a/arch/powerpc/kernel/interrupt.c
> +++ b/arch/powerpc/kernel/interrupt.c
> @@ -1,6 +1,7 @@
>   // SPDX-License-Identifier: GPL-2.0-or-later
>   
>   #include <linux/context_tracking.h>
> +#include <linux/entry-common.h>
>   #include <linux/err.h>
>   #include <linux/compat.h>
>   #include <linux/rseq.h>
> @@ -73,79 +74,6 @@ static notrace __always_inline bool prep_irq_for_enabled_exit(bool restartable)
>   	return true;
>   }
>   
> -static notrace unsigned long
> -interrupt_exit_user_prepare_main(unsigned long ret, struct pt_regs *regs)
> -{
> -	unsigned long ti_flags;
> -
> -again:
> -	ti_flags = read_thread_flags();
> -	while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
> -		local_irq_enable();
> -		if (ti_flags & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
> -			schedule();
> -		} else {
> -			/*
> -			 * SIGPENDING must restore signal handler function
> -			 * argument GPRs, and some non-volatiles (e.g., r1).
> -			 * Restore all for now. This could be made lighter.
> -			 */
> -			if (ti_flags & _TIF_SIGPENDING)
> -				ret |= _TIF_RESTOREALL;
> -			do_notify_resume(regs, ti_flags);

do_notify_resume() has no callers anymore; it should be removed from
arch/powerpc/include/asm/signal.h and arch/powerpc/kernel/signal.c.



> -		}
> -		local_irq_disable();
> -		ti_flags = read_thread_flags();
> -	}
> -
> -	if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && IS_ENABLED(CONFIG_PPC_FPU)) {
> -		if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
> -				unlikely((ti_flags & _TIF_RESTORE_TM))) {
> -			restore_tm_state(regs);
> -		} else {
> -			unsigned long mathflags = MSR_FP;
> -
> -			if (cpu_has_feature(CPU_FTR_VSX))
> -				mathflags |= MSR_VEC | MSR_VSX;
> -			else if (cpu_has_feature(CPU_FTR_ALTIVEC))
> -				mathflags |= MSR_VEC;
> -
> -			/*
> -			 * If userspace MSR has all available FP bits set,
> -			 * then they are live and no need to restore. If not,
> -			 * it means the regs were given up and restore_math
> -			 * may decide to restore them (to avoid taking an FP
> -			 * fault).
> -			 */
> -			if ((regs->msr & mathflags) != mathflags)
> -				restore_math(regs);
> -		}
> -	}
> -
> -	check_return_regs_valid(regs);
> -
> -	user_enter_irqoff();
> -	if (!prep_irq_for_enabled_exit(true)) {
> -		user_exit_irqoff();
> -		local_irq_enable();
> -		local_irq_disable();
> -		goto again;
> -	}
> -
> -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> -	local_paca->tm_scratch = regs->msr;
> -#endif
> -
> -	booke_load_dbcr0();
> -
> -	account_cpu_user_exit();
> -
> -	/* Restore user access locks last */
> -	kuap_user_restore(regs);
> -
> -	return ret;
> -}
> -
>   /*
>    * This should be called after a syscall returns, with r3 the return value
>    * from the syscall. If this function returns non-zero, the system call
> @@ -160,17 +88,12 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
>   					   long scv)
>   {
>   	unsigned long ti_flags;
> -	unsigned long ret = 0;
>   	bool is_not_scv = !IS_ENABLED(CONFIG_PPC_BOOK3S_64) || !scv;
>   
> -	CT_WARN_ON(ct_state() == CT_STATE_USER);
> -
>   	kuap_assert_locked();
>   
>   	regs->result = r3;
> -
> -	/* Check whether the syscall is issued inside a restartable sequence */
> -	rseq_syscall(regs);
> +	regs->exit_flags = 0;
>   
>   	ti_flags = read_thread_flags();
>   
> @@ -183,7 +106,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
>   
>   	if (unlikely(ti_flags & _TIF_PERSYSCALL_MASK)) {
>   		if (ti_flags & _TIF_RESTOREALL)
> -			ret = _TIF_RESTOREALL;
> +			regs->exit_flags = _TIF_RESTOREALL;
>   		else
>   			regs->gpr[3] = r3;
>   		clear_bits(_TIF_PERSYSCALL_MASK, &current_thread_info()->flags);
> @@ -192,18 +115,28 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
>   	}
>   
>   	if (unlikely(ti_flags & _TIF_SYSCALL_DOTRACE)) {
> -		do_syscall_trace_leave(regs);
> -		ret |= _TIF_RESTOREALL;
> +		regs->exit_flags |= _TIF_RESTOREALL;
>   	}
>   
> -	local_irq_disable();
> -	ret = interrupt_exit_user_prepare_main(ret, regs);
> +	syscall_exit_to_user_mode(regs);
> +
> +again:
> +	user_enter_irqoff();
> +	if (!prep_irq_for_enabled_exit(true)) {
> +		user_exit_irqoff();
> +		local_irq_enable();
> +		local_irq_disable();
> +		goto again;
> +	}
> +
> +	/* Restore user access locks last */
> +	kuap_user_restore(regs);
>   
>   #ifdef CONFIG_PPC64
> -	regs->exit_result = ret;
> +	regs->exit_result = regs->exit_flags;
>   #endif
>   
> -	return ret;
> +	return regs->exit_flags;
>   }
>   
>   #ifdef CONFIG_PPC64
> @@ -223,13 +156,16 @@ notrace unsigned long syscall_exit_restart(unsigned long r3, struct pt_regs *reg
>   	set_kuap(AMR_KUAP_BLOCKED);
>   #endif
>   
> -	trace_hardirqs_off();
> -	user_exit_irqoff();
> -	account_cpu_user_entry();
> -
> -	BUG_ON(!user_mode(regs));
> +again:
> +	user_enter_irqoff();
> +	if (!prep_irq_for_enabled_exit(true)) {
> +		user_exit_irqoff();
> +		local_irq_enable();
> +		local_irq_disable();
> +		goto again;
> +	}
>   
> -	regs->exit_result = interrupt_exit_user_prepare_main(regs->exit_result, regs);
> +	regs->exit_result |= regs->exit_flags;
>   
>   	return regs->exit_result;
>   }
> @@ -241,7 +177,6 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs)
>   
>   	BUG_ON(regs_is_unrecoverable(regs));
>   	BUG_ON(regs_irqs_disabled(regs));
> -	CT_WARN_ON(ct_state() == CT_STATE_USER);
>   
>   	/*
>   	 * We don't need to restore AMR on the way back to userspace for KUAP.
> @@ -250,8 +185,21 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs)
>   	kuap_assert_locked();
>   
>   	local_irq_disable();
> +	regs->exit_flags = 0;
> +again:
> +	check_return_regs_valid(regs);
> +	user_enter_irqoff();
> +	if (!prep_irq_for_enabled_exit(true)) {
> +		user_exit_irqoff();
> +		local_irq_enable();
> +		local_irq_disable();
> +		goto again;
> +	}
> +
> +	/* Restore user access locks last */
> +	kuap_user_restore(regs);
>   
> -	ret = interrupt_exit_user_prepare_main(0, regs);
> +	ret = regs->exit_flags;
>   
>   #ifdef CONFIG_PPC64
>   	regs->exit_result = ret;
> @@ -293,8 +241,6 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs)
>   		/* Returning to a kernel context with local irqs enabled. */
>   		WARN_ON_ONCE(!(regs->msr & MSR_EE));
>   again:
> -		if (need_irq_preemption())
> -			irqentry_exit_cond_resched();
>   
>   		check_return_regs_valid(regs);
>   
> @@ -364,7 +310,6 @@ notrace unsigned long interrupt_exit_user_restart(struct pt_regs *regs)
>   #endif
>   
>   	trace_hardirqs_off();
> -	user_exit_irqoff();
>   	account_cpu_user_entry();
>   
>   	BUG_ON(!user_mode(regs));
> diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c
> index 2134b6d155ff..316d4f5ead8e 100644
> --- a/arch/powerpc/kernel/ptrace/ptrace.c
> +++ b/arch/powerpc/kernel/ptrace/ptrace.c
> @@ -21,9 +21,6 @@
>   #include <asm/switch_to.h>
>   #include <asm/debug.h>
>   
> -#define CREATE_TRACE_POINTS
> -#include <trace/events/syscalls.h>
> -
>   #include "ptrace-decl.h"
>   
>   /*
> @@ -195,144 +192,6 @@ long arch_ptrace(struct task_struct *child, long request,
>   	return ret;
>   }
>   
> -#ifdef CONFIG_SECCOMP
> -static int do_seccomp(struct pt_regs *regs)
> -{
> -	if (!test_thread_flag(TIF_SECCOMP))
> -		return 0;
> -
> -	/*
> -	 * The ABI we present to seccomp tracers is that r3 contains
> -	 * the syscall return value and orig_gpr3 contains the first
> -	 * syscall parameter. This is different to the ptrace ABI where
> -	 * both r3 and orig_gpr3 contain the first syscall parameter.
> -	 */
> -	regs->gpr[3] = -ENOSYS;
> -
> -	/*
> -	 * We use the __ version here because we have already checked
> -	 * TIF_SECCOMP. If this fails, there is nothing left to do, we
> -	 * have already loaded -ENOSYS into r3, or seccomp has put
> -	 * something else in r3 (via SECCOMP_RET_ERRNO/TRACE).
> -	 */
> -	if (__secure_computing())
> -		return -1;
> -
> -	/*
> -	 * The syscall was allowed by seccomp, restore the register
> -	 * state to what audit expects.
> -	 * Note that we use orig_gpr3, which means a seccomp tracer can
> -	 * modify the first syscall parameter (in orig_gpr3) and also
> -	 * allow the syscall to proceed.
> -	 */
> -	regs->gpr[3] = regs->orig_gpr3;
> -
> -	return 0;
> -}
> -#else
> -static inline int do_seccomp(struct pt_regs *regs) { return 0; }
> -#endif /* CONFIG_SECCOMP */
> -
> -/**
> - * do_syscall_trace_enter() - Do syscall tracing on kernel entry.
> - * @regs: the pt_regs of the task to trace (current)
> - *
> - * Performs various types of tracing on syscall entry. This includes seccomp,
> - * ptrace, syscall tracepoints and audit.
> - *
> - * The pt_regs are potentially visible to userspace via ptrace, so their
> - * contents is ABI.
> - *
> - * One or more of the tracers may modify the contents of pt_regs, in particular
> - * to modify arguments or even the syscall number itself.
> - *
> - * It's also possible that a tracer can choose to reject the system call. In
> - * that case this function will return an illegal syscall number, and will put
> - * an appropriate return value in regs->r3.
> - *
> - * Return: the (possibly changed) syscall number.
> - */
> -long do_syscall_trace_enter(struct pt_regs *regs)

Remove prototype from arch/powerpc/include/asm/ptrace.h

> -{
> -	u32 flags;
> -
> -	flags = read_thread_flags() & (_TIF_SYSCALL_EMU | _TIF_SYSCALL_TRACE);
> -
> -	if (flags) {
> -		int rc = ptrace_report_syscall_entry(regs);
> -
> -		if (unlikely(flags & _TIF_SYSCALL_EMU)) {
> -			/*
> -			 * A nonzero return code from
> -			 * ptrace_report_syscall_entry() tells us to prevent
> -			 * the syscall execution, but we are not going to
> -			 * execute it anyway.
> -			 *
> -			 * Returning -1 will skip the syscall execution. We want
> -			 * to avoid clobbering any registers, so we don't goto
> -			 * the skip label below.
> -			 */
> -			return -1;
> -		}
> -
> -		if (rc) {
> -			/*
> -			 * The tracer decided to abort the syscall. Note that
> -			 * the tracer may also just change regs->gpr[0] to an
> -			 * invalid syscall number, that is handled below on the
> -			 * exit path.
> -			 */
> -			goto skip;
> -		}
> -	}
> -
> -	/* Run seccomp after ptrace; allow it to set gpr[3]. */
> -	if (do_seccomp(regs))
> -		return -1;
> -
> -	/* Avoid trace and audit when syscall is invalid. */
> -	if (regs->gpr[0] >= NR_syscalls)
> -		goto skip;
> -
> -	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
> -		trace_sys_enter(regs, regs->gpr[0]);
> -
> -	if (!is_32bit_task())
> -		audit_syscall_entry(regs->gpr[0], regs->gpr[3], regs->gpr[4],
> -				    regs->gpr[5], regs->gpr[6]);
> -	else
> -		audit_syscall_entry(regs->gpr[0],
> -				    regs->gpr[3] & 0xffffffff,
> -				    regs->gpr[4] & 0xffffffff,
> -				    regs->gpr[5] & 0xffffffff,
> -				    regs->gpr[6] & 0xffffffff);
> -
> -	/* Return the possibly modified but valid syscall number */
> -	return regs->gpr[0];
> -
> -skip:
> -	/*
> -	 * If we are aborting explicitly, or if the syscall number is
> -	 * now invalid, set the return value to -ENOSYS.
> -	 */
> -	regs->gpr[3] = -ENOSYS;
> -	return -1;
> -}
> -
> -void do_syscall_trace_leave(struct pt_regs *regs)
> -{
> -	int step;
> -
> -	audit_syscall_exit(regs);
> -
> -	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
> -		trace_sys_exit(regs, regs->result);
> -
> -	step = test_thread_flag(TIF_SINGLESTEP);
> -	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
> -		ptrace_report_syscall_exit(regs, step);
> -}
> -
>   void __init pt_regs_check(void);
>   
>   /*
> diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
> index 719930cf4ae1..9f1847b4742e 100644
> --- a/arch/powerpc/kernel/signal.c
> +++ b/arch/powerpc/kernel/signal.c
> @@ -6,6 +6,7 @@
>    *    Extracted from signal_32.c and signal_64.c
>    */
>   
> +#include <linux/entry-common.h>
>   #include <linux/resume_user_mode.h>
>   #include <linux/signal.h>
>   #include <linux/uprobes.h>
> @@ -22,11 +23,6 @@
>   
>   #include "signal.h"
>   
> -/* This will be removed */
> -#ifdef CONFIG_GENERIC_ENTRY
> -#include <linux/entry-common.h>
> -#endif /* CONFIG_GENERIC_ENTRY */
> -

Until now, CONFIG_GENERIC_ENTRY was not defined.

Now that it is defined, we remove the entire block?

Then why was it added at all?

>   #ifdef CONFIG_VSX
>   unsigned long copy_fpr_to_user(void __user *to,
>   			       struct task_struct *task)
> @@ -374,11 +370,9 @@ void signal_fault(struct task_struct *tsk, struct pt_regs *regs,
>   				   task_pid_nr(tsk), where, ptr, regs->nip, regs->link);
>   }
>   
> -#ifdef CONFIG_GENERIC_ENTRY
>   void arch_do_signal_or_restart(struct pt_regs *regs)
>   {
>   	BUG_ON(regs != current->thread.regs);
> -	local_paca->generic_fw_flags |= GFW_RESTORE_ALL;

Why was that there? I thought it was preparatory, but then you remove it
before ever using it?

> +	regs->exit_flags |= _TIF_RESTOREALL;
>   	do_signal(current);
>   }
> -#endif /* CONFIG_GENERIC_ENTRY */
