Message-ID: <20251029144538.5eb5c772@gandalf.local.home>
Date: Wed, 29 Oct 2025 14:45:38 -0400
From: Steven Rostedt <rostedt@...dmis.org>
To: Thomas Gleixner <tglx@...utronix.de>
Cc: LKML <linux-kernel@...r.kernel.org>, Peter Zijlstra
<peterz@...radead.org>, Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
"Paul E. McKenney" <paulmck@...nel.org>, Boqun Feng <boqun.feng@...il.com>,
Jonathan Corbet <corbet@....net>, Prakash Sangappa
<prakash.sangappa@...cle.com>, Madadi Vineeth Reddy
<vineethr@...ux.ibm.com>, K Prateek Nayak <kprateek.nayak@....com>,
Sebastian Andrzej Siewior <bigeasy@...utronix.de>, Arnd Bergmann
<arnd@...db.de>, linux-arch@...r.kernel.org
Subject: Re: [patch V3 08/12] rseq: Implement time slice extension
enforcement timer
On Wed, 29 Oct 2025 14:22:26 +0100 (CET)
Thomas Gleixner <tglx@...utronix.de> wrote:
> --- a/include/linux/rseq_entry.h
> +++ b/include/linux/rseq_entry.h
> @@ -86,8 +86,24 @@ static __always_inline bool rseq_slice_e
> {
> return static_branch_likely(&rseq_slice_extension_key);
> }
> +
> +extern unsigned int rseq_slice_ext_nsecs;
> +bool __rseq_arm_slice_extension_timer(void);
> +
> +static __always_inline bool rseq_arm_slice_extension_timer(void)
> +{
> + if (!rseq_slice_extension_enabled())
> + return false;
> +
> + if (likely(!current->rseq.slice.state.granted))
> + return false;
> +
> + return __rseq_arm_slice_extension_timer();
> +}
> +
> #else /* CONFIG_RSEQ_SLICE_EXTENSION */
> static inline bool rseq_slice_extension_enabled(void) { return false; }
> +static inline bool rseq_arm_slice_extension_timer(void) { return false; }
> #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
>
> bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
> @@ -542,17 +558,19 @@ static __always_inline void clear_tif_rs
> static __always_inline bool
> rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
> {
> - if (likely(!test_tif_rseq(ti_work)))
> - return false;
> -
> - if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
> - current->rseq.event.slowpath = true;
> - set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
> - return true;
> + if (unlikely(test_tif_rseq(ti_work))) {
> + if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
> + current->rseq.event.slowpath = true;
> + set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
> + return true;
Just to make sure I understand this: by setting TIF_NOTIFY_RESUME and
returning true, it can still come back and arm the timer?
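My (possibly wrong) mental model of the caller is something like:

	/* Simplified sketch from memory, not the literal entry code */
	do {
		ti_work = exit_to_user_mode_loop(regs, ti_work);
	} while (rseq_exit_to_user_mode_restart(regs, ti_work));

i.e. returning true just buys another trip through the work loop before
the timer arming is attempted again.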
I guess this also raises the question of whether user space can use
restartable sequences at the same time as requesting an extended time
slice?
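Something like this is what I have in mind (user space pseudo code; the
field and flag names here are made up for illustration, not the actual
ABI from this series):

	/* Hypothetical: request an extension, run an rseq critical
	 * section, then yield if the grant was actually given. */
	rs->slice_ctrl = RSEQ_SLICE_EXT_REQUEST;

	/* ... rseq critical section runs here ... */

	if (rs->slice_ctrl & RSEQ_SLICE_EXT_GRANTED)
		syscall(__NR_rseq_slice_yield);

Is that combination expected to work, or do the two features step on
each other?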
> + }
> + clear_tif_rseq();
> }
> -
> - clear_tif_rseq();
> - return false;
> + /*
> + * Arm the slice extension timer if nothing to do anymore and the
> + * task really goes out to user space.
> + */
> + return rseq_arm_slice_extension_timer();
> }
>
> #endif /* CONFIG_GENERIC_ENTRY */
> --- a/include/linux/rseq_types.h
> +++ b/include/linux/rseq_types.h
> @@ -89,9 +89,11 @@ union rseq_slice_state {
> /**
> * struct rseq_slice - Status information for rseq time slice extension
> * @state: Time slice extension state
> + * @expires: The time when a grant expires
> */
> struct rseq_slice {
> union rseq_slice_state state;
> + u64 expires;
> };
>
> /**
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -71,6 +71,8 @@
> #define RSEQ_BUILD_SLOW_PATH
>
> #include <linux/debugfs.h>
> +#include <linux/hrtimer.h>
> +#include <linux/percpu.h>
> #include <linux/prctl.h>
> #include <linux/ratelimit.h>
> #include <linux/rseq_entry.h>
> @@ -499,8 +501,78 @@ SYSCALL_DEFINE4(rseq, struct rseq __user
> }
>
> #ifdef CONFIG_RSEQ_SLICE_EXTENSION
> +struct slice_timer {
> + struct hrtimer timer;
> + void *cookie;
> +};
> +
> +unsigned int rseq_slice_ext_nsecs __read_mostly = 30 * NSEC_PER_USEC;
> +static DEFINE_PER_CPU(struct slice_timer, slice_timer);
> DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key);
>
> +static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr)
> +{
> + struct slice_timer *st = container_of(tmr, struct slice_timer, timer);
> +
> + if (st->cookie == current && current->rseq.slice.state.granted) {
> + rseq_stat_inc(rseq_stats.s_expired);
> + set_need_resched_current();
> + }
> + return HRTIMER_NORESTART;
> +}
> +
> +bool __rseq_arm_slice_extension_timer(void)
> +{
> + struct slice_timer *st = this_cpu_ptr(&slice_timer);
> + struct task_struct *curr = current;
> +
> + lockdep_assert_irqs_disabled();
> +
> + /*
> + * This check prevents that a granted time slice extension exceeds
This check prevents a granted time slice ...
> + * the maximum scheduling latency when the grant expired before
> + * going out to user space. Don't bother to clear the grant here,
> + * it will be cleaned up automatically before going out to user
> + * space.
> + */
> + if ((unlikely(curr->rseq.slice.expires < ktime_get_mono_fast_ns()))) {
> + set_need_resched_current();
> + return true;
> + }
> +
> + /*
> + * Store the task pointer as a cookie for comparison in the timer
> + * function. This is safe as the timer is CPU local and cannot be
> + * in the expiry function at this point.
> + */
I'm just curious about this scenario:
1) Task A requests an extension and is granted.
   st->cookie = Task A
   hrtimer_start();
2) Before getting back to user space, an RT kernel thread wakes up and
preempts Task A. Does this clear the timer?
3) The RT kernel thread finishes, and Task B is scheduled in before the
original timer expires.
4) Task B requests an extension (assuming its time slice was short
enough that this happens before the original timer expires).
I guess it doesn't matter that st->cookie = Task B, as Task A was already
scheduled out. But would calling hrtimer_start() on an already queued
timer cause any issue?
I guess it doesn't, as it looks like hrtimer_start() does indeed remove
a still queued timer before re-arming it.
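From memory it does roughly this (paraphrased, not verbatim from
kernel/time/hrtimer.c):

	/* __hrtimer_start_range_ns() dequeues a queued timer first */
	remove_hrtimer(timer, base, true, force_local);
	...
	enqueue_hrtimer(timer, new_base, mode);

so re-arming with a new expiry should be safe.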
> + st->cookie = curr;
> + hrtimer_start(&st->timer, curr->rseq.slice.expires, HRTIMER_MODE_ABS_PINNED_HARD);
> + /* Arm the syscall entry work */
> + set_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);
> + return false;
> +}
> +
> +static void rseq_cancel_slice_extension_timer(void)
> +{
> + struct slice_timer *st = this_cpu_ptr(&slice_timer);
> +
> + /*
> + * st->cookie can be safely read as preemption is disabled and the
> + * timer is CPU local.
> + *
> + * As this is most probably the first expiring timer, the cancel is
As this is probably the first ...
> + * expensive as it has to reprogram the hardware, but that's less
> + * expensive than going through a full hrtimer_interrupt() cycle
> + * for nothing.
> + *
> + * hrtimer_try_to_cancel() is sufficient here as the timer is CPU
> + * local and once the hrtimer code disabled interrupts the timer
> + * callback cannot be running.
> + */
> + if (st->cookie == current)
> + hrtimer_try_to_cancel(&st->timer);
If the above scenario did happen, the timer would still go off, as
st->cookie == current would likely be false?
Hmm, if it does go off and the task had scheduled back in, would it get
its need_resched set? This is a very unlikely scenario, so I guess it
doesn't really matter.
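Though I suppose the double check in rseq_slice_expired() covers it:

	/* From the patch above: a stale expiry is a no-op unless the
	 * armed task is still current *and* still holds a grant, and
	 * in that case setting need_resched is correct anyway. */
	if (st->cookie == current && current->rseq.slice.state.granted)
		set_need_resched_current();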
I'm just thinking about corner cases, how they could affect this code,
and whether they could cause noticeable issues.
-- Steve
> +}
> +
> static inline void rseq_slice_set_need_resched(struct task_struct *curr)
> {
> /*
> @@ -558,10 +630,11 @@ void rseq_syscall_enter_work(long syscal
> rseq_stat_inc(rseq_stats.s_yielded);
>
> /*
> - * Required to make set_tsk_need_resched() correct on PREEMPT[RT]
> - * kernels.
> + * Required to stabilize the per CPU timer pointer and to make
> + * set_tsk_need_resched() correct on PREEMPT[RT] kernels.
> */
> scoped_guard(preempt) {
> + rseq_cancel_slice_extension_timer();
> /*
> * Now that preemption is disabled, quickly check whether
> * the task was already rescheduled before arriving here.
> @@ -652,6 +725,31 @@ SYSCALL_DEFINE0(rseq_slice_yield)
> return 0;
> }
>
> +#ifdef CONFIG_SYSCTL
> +static const unsigned int rseq_slice_ext_nsecs_min = 10 * NSEC_PER_USEC;
> +static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC;
> +
> +static const struct ctl_table rseq_slice_ext_sysctl[] = {
> + {
> + .procname = "rseq_slice_extension_nsec",
> + .data = &rseq_slice_ext_nsecs,
> + .maxlen = sizeof(unsigned int),
> + .mode = 0644,
> + .proc_handler = proc_douintvec_minmax,
> + .extra1 = (unsigned int *)&rseq_slice_ext_nsecs_min,
> + .extra2 = (unsigned int *)&rseq_slice_ext_nsecs_max,
> + },
> +};
> +
> +static void rseq_slice_sysctl_init(void)
> +{
> + if (rseq_slice_extension_enabled())
> + register_sysctl_init("kernel", rseq_slice_ext_sysctl);
> +}
> +#else /* CONFIG_SYSCTL */
> +static inline void rseq_slice_sysctl_init(void) { }
> +#endif /* !CONFIG_SYSCTL */
> +
> static int __init rseq_slice_cmdline(char *str)
> {
> bool on;
> @@ -664,4 +762,17 @@ static int __init rseq_slice_cmdline(cha
> return 1;
> }
> __setup("rseq_slice_ext=", rseq_slice_cmdline);
> +
> +static int __init rseq_slice_init(void)
> +{
> + unsigned int cpu;
> +
> + for_each_possible_cpu(cpu) {
> + hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired,
> + CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD);
> + }
> + rseq_slice_sysctl_init();
> + return 0;
> +}
> +device_initcall(rseq_slice_init);
> #endif /* CONFIG_RSEQ_SLICE_EXTENSION */