Message-ID: <aYUOME2XV_ZmRk3F@tardis.local>
Date: Thu, 5 Feb 2026 13:40:00 -0800
From: Boqun Feng <boqun@...nel.org>
To: Peter Zijlstra <peterz@...radead.org>
Cc: Lyude Paul <lyude@...hat.com>, rust-for-linux@...r.kernel.org,
	linux-kernel@...r.kernel.org, Thomas Gleixner <tglx@...utronix.de>,
	Boqun Feng <boqun.feng@...il.com>,
	Daniel Almeida <daniel.almeida@...labora.com>,
	Miguel Ojeda <ojeda@...nel.org>,
	Alex Gaynor <alex.gaynor@...il.com>, Gary Guo <gary@...yguo.net>,
	Björn Roy Baron <bjorn3_gh@...tonmail.com>,
	Benno Lossin <lossin@...nel.org>,
	Andreas Hindborg <a.hindborg@...nel.org>,
	Alice Ryhl <aliceryhl@...gle.com>, Trevor Gross <tmgross@...ch.edu>,
	Danilo Krummrich <dakr@...nel.org>,
	Andrew Morton <akpm@...ux-foundation.org>,
	Ingo Molnar <mingo@...hat.com>, Will Deacon <will@...nel.org>,
	Waiman Long <longman@...hat.com>,
	Joel Fernandes <joelagnelf@...dia.com>
Subject: Re: [PATCH v17 02/16] preempt: Track NMI nesting to separate per-CPU
 counter

On Wed, Feb 04, 2026 at 12:12:34PM +0100, Peter Zijlstra wrote:
> On Tue, Feb 03, 2026 at 01:15:21PM +0100, Peter Zijlstra wrote:
> > But I'm really somewhat sad that 64bit can't do better than this.
> 
> Here, the below builds and boots (albeit with warnings because printf
> format crap sucks).
> 

Thanks! I will drop patches #1 and #2 and use this one (with a commit log
and some more tests). Given that it is based on the work of Joel, Lyude and
me, would the following tags make sense to all of you?

Co-developed-by: Joel Fernandes <joelagnelf@...dia.com>
Signed-off-by: Joel Fernandes <joelagnelf@...dia.com>
Co-developed-by: Lyude Paul <lyude@...hat.com>
Signed-off-by: Lyude Paul <lyude@...hat.com>
Co-developed-by: Peter Zijlstra (Intel) <peterz@...radead.org> 
Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
Signed-off-by: Boqun Feng <boqun@...nel.org>

Regards,
Boqun

> ---
>  arch/x86/Kconfig               |  1 +
>  arch/x86/include/asm/preempt.h | 53 ++++++++++++++++++++++++++++++------------
>  arch/x86/kernel/cpu/common.c   |  2 +-
>  include/linux/hardirq.h        |  7 +++---
>  include/linux/preempt.h        | 52 ++++++++++++++++++++++++++++++++++-------
>  init/main.c                    |  2 +-
>  kernel/Kconfig.preempt         |  4 ++++
>  kernel/sched/core.c            |  8 +++----
>  kernel/softirq.c               | 10 +++++++-
>  kernel/time/timer.c            |  2 +-
>  lib/locking-selftest.c         |  2 +-
>  11 files changed, 106 insertions(+), 37 deletions(-)
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 80527299f859..2bd1972fd4c7 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -326,6 +326,7 @@ config X86
>  	select USER_STACKTRACE_SUPPORT
>  	select HAVE_ARCH_KCSAN			if X86_64
>  	select PROC_PID_ARCH_STATUS		if PROC_FS
> +	select PREEMPT_LONG			if X86_64
>  	select HAVE_ARCH_NODE_DEV_GROUP		if X86_SGX
>  	select FUNCTION_ALIGNMENT_16B		if X86_64 || X86_ALIGNMENT_16
>  	select FUNCTION_ALIGNMENT_4B
> diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
> index 578441db09f0..1b54d5555138 100644
> --- a/arch/x86/include/asm/preempt.h
> +++ b/arch/x86/include/asm/preempt.h
> @@ -7,10 +7,19 @@
>  
>  #include <linux/static_call_types.h>
>  
> -DECLARE_PER_CPU_CACHE_HOT(int, __preempt_count);
> +DECLARE_PER_CPU_CACHE_HOT(unsigned long, __preempt_count);
>  
> -/* We use the MSB mostly because its available */
> -#define PREEMPT_NEED_RESCHED	0x80000000
> +/*
> + * We use the MSB for PREEMPT_NEED_RESCHED mostly because it is available.
> + */
> +
> +#ifdef CONFIG_64BIT
> +#define PREEMPT_NEED_RESCHED	(~((~0UL) >> 1))
> +#define __pc_op(op, ...)	raw_cpu_##op##_8(__VA_ARGS__)
> +#else
> +#define PREEMPT_NEED_RESCHED	(~((~0U) >> 1))
> +#define __pc_op(op, ...)	raw_cpu_##op##_4(__VA_ARGS__)
> +#endif
>  
>  /*
>   * We use the PREEMPT_NEED_RESCHED bit as an inverted NEED_RESCHED such
> @@ -24,18 +33,18 @@ DECLARE_PER_CPU_CACHE_HOT(int, __preempt_count);
>   */
>  static __always_inline int preempt_count(void)
>  {
> -	return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
> +	return __pc_op(read, __preempt_count) & ~PREEMPT_NEED_RESCHED;
>  }
>  
> -static __always_inline void preempt_count_set(int pc)
> +static __always_inline void preempt_count_set(long pc)
>  {
>  	int old, new;
>  
> -	old = raw_cpu_read_4(__preempt_count);
> +	old = __pc_op(read, __preempt_count);
>  	do {
>  		new = (old & PREEMPT_NEED_RESCHED) |
>  			(pc & ~PREEMPT_NEED_RESCHED);
> -	} while (!raw_cpu_try_cmpxchg_4(__preempt_count, &old, new));
> +	} while (!__pc_op(try_cmpxchg, __preempt_count, &old, new));
>  }
>  
>  /*
> @@ -58,33 +67,45 @@ static __always_inline void preempt_count_set(int pc)
>  
>  static __always_inline void set_preempt_need_resched(void)
>  {
> -	raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED);
> +	__pc_op(and, __preempt_count, ~PREEMPT_NEED_RESCHED);
>  }
>  
>  static __always_inline void clear_preempt_need_resched(void)
>  {
> -	raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED);
> +	__pc_op(or, __preempt_count, PREEMPT_NEED_RESCHED);
>  }
>  
>  static __always_inline bool test_preempt_need_resched(void)
>  {
> -	return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED);
> +	return !(__pc_op(read, __preempt_count) & PREEMPT_NEED_RESCHED);
>  }
>  
>  /*
>   * The various preempt_count add/sub methods
>   */
>  
> -static __always_inline void __preempt_count_add(int val)
> +static __always_inline void __preempt_count_add(long val)
>  {
> -	raw_cpu_add_4(__preempt_count, val);
> +	__pc_op(add, __preempt_count, val);
>  }
>  
> -static __always_inline void __preempt_count_sub(int val)
> +static __always_inline void __preempt_count_sub(long val)
>  {
> -	raw_cpu_add_4(__preempt_count, -val);
> +	__pc_op(add, __preempt_count, -val);
>  }
>  
> +#ifdef CONFIG_64BIT
> +static __always_inline void __preempt_count_nmi_enter(void)
> +{
> +	__preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);
> +}
> +
> +static __always_inline void __preempt_count_nmi_exit(void)
> +{
> +	__preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET);
> +}
> +#endif
> +
>  /*
>   * Because we keep PREEMPT_NEED_RESCHED set when we do _not_ need to reschedule
>   * a decrement which hits zero means we have no preempt_count and should
> @@ -101,7 +122,7 @@ static __always_inline bool __preempt_count_dec_and_test(void)
>   */
>  static __always_inline bool should_resched(int preempt_offset)
>  {
> -	return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
> +	return unlikely(__pc_op(read, __preempt_count) == preempt_offset);
>  }
>  
>  #ifdef CONFIG_PREEMPTION
> @@ -148,4 +169,6 @@ do { \
>  
>  #endif /* PREEMPTION */
>  
> +#undef __pc_op
> +
>  #endif /* __ASM_PREEMPT_H */
> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
> index e7ab22fce3b5..9d3602f085c9 100644
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -2219,7 +2219,7 @@ DEFINE_PER_CPU_CACHE_HOT(struct task_struct *, current_task) = &init_task;
>  EXPORT_PER_CPU_SYMBOL(current_task);
>  EXPORT_PER_CPU_SYMBOL(const_current_task);
>  
> -DEFINE_PER_CPU_CACHE_HOT(int, __preempt_count) = INIT_PREEMPT_COUNT;
> +DEFINE_PER_CPU_CACHE_HOT(unsigned long, __preempt_count) = INIT_PREEMPT_COUNT;
>  EXPORT_PER_CPU_SYMBOL(__preempt_count);
>  
>  DEFINE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
> diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
> index d57cab4d4c06..77defd9624bf 100644
> --- a/include/linux/hardirq.h
> +++ b/include/linux/hardirq.h
> @@ -108,15 +108,14 @@ void irq_exit_rcu(void);
>  	do {							\
>  		lockdep_off();					\
>  		arch_nmi_enter();				\
> -		BUG_ON(in_nmi() == NMI_MASK);			\
> -		__preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);	\
> +		__preempt_count_nmi_enter();			\
>  	} while (0)
>  
>  #define nmi_enter()						\
>  	do {							\
>  		__nmi_enter();					\
>  		lockdep_hardirq_enter();			\
> -		ct_nmi_enter();				\
> +		ct_nmi_enter();					\
>  		instrumentation_begin();			\
>  		ftrace_nmi_enter();				\
>  		instrumentation_end();				\
> @@ -125,7 +124,7 @@ void irq_exit_rcu(void);
>  #define __nmi_exit()						\
>  	do {							\
>  		BUG_ON(!in_nmi());				\
> -		__preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET);	\
> +		__preempt_count_nmi_exit();			\
>  		arch_nmi_exit();				\
>  		lockdep_on();					\
>  	} while (0)
> diff --git a/include/linux/preempt.h b/include/linux/preempt.h
> index d964f965c8ff..7617ca97f442 100644
> --- a/include/linux/preempt.h
> +++ b/include/linux/preempt.h
> @@ -17,6 +17,9 @@
>   *
>   * - bits 0-7 are the preemption count (max preemption depth: 256)
>   * - bits 8-15 are the softirq count (max # of softirqs: 256)
> + * - bits 16-23 are the hardirq disable count (max # of hardirq disable: 256)
> + * - bits 24-27 are the hardirq count (max # of hardirqs: 16)
> + * - bit 28 is the NMI flag (no nesting count, tracked separately)
>   *
>   * The hardirq count could in theory be the same as the number of
>   * interrupts in the system, but we run all interrupt handlers with
> @@ -24,31 +27,41 @@
>   * there are a few palaeontologic drivers which reenable interrupts in
>   * the handler, so we need more than one bit here.
>   *
> - *         PREEMPT_MASK:	0x000000ff
> - *         SOFTIRQ_MASK:	0x0000ff00
> - *         HARDIRQ_MASK:	0x000f0000
> - *             NMI_MASK:	0x00f00000
> - * PREEMPT_NEED_RESCHED:	0x80000000
> + * NMI nesting depth is tracked in a separate per-CPU variable
> + * (nmi_nesting) to save bits in preempt_count.
> + *
> + *				32bit		64bit + PREEMPT_LONG
> + *
> + *         PREEMPT_MASK:	0x000000ff	0x00000000000000ff
> + *         SOFTIRQ_MASK:	0x0000ff00	0x000000000000ff00
> + * HARDIRQ_DISABLE_MASK:	0x00ff0000	0x0000000000ff0000
> + *         HARDIRQ_MASK:	0x0f000000	0x000000000f000000
> + *             NMI_MASK:	0x10000000	0x00000000f0000000
> + * PREEMPT_NEED_RESCHED:	0x80000000	0x8000000000000000
>   */
>  #define PREEMPT_BITS	8
>  #define SOFTIRQ_BITS	8
> +#define HARDIRQ_DISABLE_BITS	8
>  #define HARDIRQ_BITS	4
> -#define NMI_BITS	4
> +#define NMI_BITS	(1 + 3*IS_ENABLED(CONFIG_PREEMPT_LONG))
>  
>  #define PREEMPT_SHIFT	0
>  #define SOFTIRQ_SHIFT	(PREEMPT_SHIFT + PREEMPT_BITS)
> -#define HARDIRQ_SHIFT	(SOFTIRQ_SHIFT + SOFTIRQ_BITS)
> +#define HARDIRQ_DISABLE_SHIFT	(SOFTIRQ_SHIFT + SOFTIRQ_BITS)
> +#define HARDIRQ_SHIFT	(HARDIRQ_DISABLE_SHIFT + HARDIRQ_DISABLE_BITS)
>  #define NMI_SHIFT	(HARDIRQ_SHIFT + HARDIRQ_BITS)
>  
>  #define __IRQ_MASK(x)	((1UL << (x))-1)
>  
>  #define PREEMPT_MASK	(__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
>  #define SOFTIRQ_MASK	(__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
> +#define HARDIRQ_DISABLE_MASK	(__IRQ_MASK(HARDIRQ_DISABLE_BITS) << HARDIRQ_DISABLE_SHIFT)
>  #define HARDIRQ_MASK	(__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
>  #define NMI_MASK	(__IRQ_MASK(NMI_BITS)     << NMI_SHIFT)
>  
>  #define PREEMPT_OFFSET	(1UL << PREEMPT_SHIFT)
>  #define SOFTIRQ_OFFSET	(1UL << SOFTIRQ_SHIFT)
> +#define HARDIRQ_DISABLE_OFFSET	(1UL << HARDIRQ_DISABLE_SHIFT)
>  #define HARDIRQ_OFFSET	(1UL << HARDIRQ_SHIFT)
>  #define NMI_OFFSET	(1UL << NMI_SHIFT)
>  
> @@ -105,8 +118,8 @@ static __always_inline unsigned char interrupt_context_level(void)
>   * preempt_count() is commonly implemented with READ_ONCE().
>   */
>  
> -#define nmi_count()	(preempt_count() & NMI_MASK)
> -#define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
> +#define nmi_count()		(preempt_count() & NMI_MASK)
> +#define hardirq_count()		(preempt_count() & HARDIRQ_MASK)
>  #ifdef CONFIG_PREEMPT_RT
>  # define softirq_count()	(current->softirq_disable_cnt & SOFTIRQ_MASK)
>  # define irq_count()		((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count())
> @@ -132,6 +145,27 @@ static __always_inline unsigned char interrupt_context_level(void)
>  # define in_task()		(!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
>  #endif
>  
> +#ifndef CONFIG_PREEMPT_LONG
> +DECLARE_PER_CPU(unsigned int, nmi_nesting);
> +
> +#define __preempt_count_nmi_enter()				\
> +	do {							\
> +		unsigned int _o = NMI_MASK + HARDIRQ_OFFSET;	\
> +		__this_cpu_inc(nmi_nesting);			\
> +		_o -= (preempt_count() & NMI_MASK);		\
> +		__preempt_count_add(_o);			\
> +	} while (0)
> +
> +#define __preempt_count_nmi_exit()				\
> +	do {							\
> +		unsigned int _o = HARDIRQ_OFFSET;		\
> +		if (!__this_cpu_dec_return(nmi_nesting))	\
> +			_o += NMI_MASK;				\
> +		__preempt_count_sub(_o);			\
> +	} while (0)
> +
> +#endif
> +
>  /*
>   * The following macros are deprecated and should not be used in new code:
>   * in_softirq()   - We have BH disabled, or are processing softirqs
> diff --git a/init/main.c b/init/main.c
> index b84818ad9685..f8f4b78b7a06 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -1367,7 +1367,7 @@ static inline void do_trace_initcall_level(const char *level)
>  
>  int __init_or_module do_one_initcall(initcall_t fn)
>  {
> -	int count = preempt_count();
> +	long count = preempt_count();
>  	char msgbuf[64];
>  	int ret;
>  
> diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
> index 88c594c6d7fc..2ad9365915eb 100644
> --- a/kernel/Kconfig.preempt
> +++ b/kernel/Kconfig.preempt
> @@ -122,6 +122,10 @@ config PREEMPT_RT_NEEDS_BH_LOCK
>  config PREEMPT_COUNT
>         bool
>  
> +config PREEMPT_LONG
> +	bool
> +	depends on PREEMPT_COUNT && 64BIT
> +
>  config PREEMPTION
>         bool
>         select PREEMPT_COUNT
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index b411e4feff7f..f54dd3cb66f2 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5709,7 +5709,7 @@ static inline void sched_tick_stop(int cpu) { }
>   * If the value passed in is equal to the current preempt count
>   * then we just disabled preemption. Start timing the latency.
>   */
> -static inline void preempt_latency_start(int val)
> +static inline void preempt_latency_start(long val)
>  {
>  	if (preempt_count() == val) {
>  		unsigned long ip = get_lock_parent_ip();
> @@ -5746,7 +5746,7 @@ NOKPROBE_SYMBOL(preempt_count_add);
>   * If the value passed in equals to the current preempt count
>   * then we just enabled preemption. Stop timing the latency.
>   */
> -static inline void preempt_latency_stop(int val)
> +static inline void preempt_latency_stop(long val)
>  {
>  	if (preempt_count() == val)
>  		trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
> @@ -8774,7 +8774,7 @@ void __might_sleep(const char *file, int line)
>  }
>  EXPORT_SYMBOL(__might_sleep);
>  
> -static void print_preempt_disable_ip(int preempt_offset, unsigned long ip)
> +static void print_preempt_disable_ip(long preempt_offset, unsigned long ip)
>  {
>  	if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT))
>  		return;
> @@ -8846,7 +8846,7 @@ void __might_resched(const char *file, int line, unsigned int offsets)
>  }
>  EXPORT_SYMBOL(__might_resched);
>  
> -void __cant_sleep(const char *file, int line, int preempt_offset)
> +void __cant_sleep(const char *file, int line, long preempt_offset)
>  {
>  	static unsigned long prev_jiffy;
>  
> diff --git a/kernel/softirq.c b/kernel/softirq.c
> index 77198911b8dd..51a7f391edab 100644
> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -88,6 +88,14 @@ EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled);
>  EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context);
>  #endif
>  
> +#ifndef CONFIG_PREEMPT_LONG
> +/*
> + * Any 32bit architecture that still cares about performance should
> + * probably ensure this is near preempt_count.
> + */
> +DEFINE_PER_CPU(unsigned int, nmi_nesting);
> +#endif
> +
>  /*
>   * SOFTIRQ_OFFSET usage:
>   *
> @@ -609,7 +617,7 @@ static void handle_softirqs(bool ksirqd)
>  
>  	while ((softirq_bit = ffs(pending))) {
>  		unsigned int vec_nr;
> -		int prev_count;
> +		long prev_count;
>  
>  		h += softirq_bit - 1;
>  
> diff --git a/kernel/time/timer.c b/kernel/time/timer.c
> index 1f2364126894..89c348139218 100644
> --- a/kernel/time/timer.c
> +++ b/kernel/time/timer.c
> @@ -1723,7 +1723,7 @@ static void call_timer_fn(struct timer_list *timer,
>  			  void (*fn)(struct timer_list *),
>  			  unsigned long baseclk)
>  {
> -	int count = preempt_count();
> +	long count = preempt_count();
>  
>  #ifdef CONFIG_LOCKDEP
>  	/*
> diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
> index d939403331b5..8fd216bd0be6 100644
> --- a/lib/locking-selftest.c
> +++ b/lib/locking-selftest.c
> @@ -1429,7 +1429,7 @@ static int unexpected_testcase_failures;
>  
>  static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask)
>  {
> -	int saved_preempt_count = preempt_count();
> +	long saved_preempt_count = preempt_count();
>  #ifdef CONFIG_PREEMPT_RT
>  #ifdef CONFIG_SMP
>  	int saved_mgd_count = current->migration_disabled;

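P.S. For anyone following along, a minimal user-space sketch of the
!CONFIG_PREEMPT_LONG fallback above, purely for illustration and not part of
the patch: plain globals stand in for the per-CPU __preempt_count and
nmi_nesting variables, the constants follow the 32-bit layout documented in
the patch, and the nmi_enter()/nmi_exit() helpers are simplified stand-ins
for __nmi_enter()/__nmi_exit(). It only shows how nested NMIs share a single
NMI flag bit while the actual nesting depth lives in the separate counter.

/* Illustrative sketch only; builds with any hosted C compiler. */
#include <assert.h>
#include <stdio.h>

#define PREEMPT_BITS		8
#define SOFTIRQ_BITS		8
#define HARDIRQ_DISABLE_BITS	8
#define HARDIRQ_BITS		4
#define NMI_BITS		1	/* 32-bit case: single NMI flag bit */

#define PREEMPT_SHIFT		0
#define SOFTIRQ_SHIFT		(PREEMPT_SHIFT + PREEMPT_BITS)
#define HARDIRQ_DISABLE_SHIFT	(SOFTIRQ_SHIFT + SOFTIRQ_BITS)
#define HARDIRQ_SHIFT		(HARDIRQ_DISABLE_SHIFT + HARDIRQ_DISABLE_BITS)
#define NMI_SHIFT		(HARDIRQ_SHIFT + HARDIRQ_BITS)

#define __IRQ_MASK(x)		((1UL << (x)) - 1)
#define HARDIRQ_MASK		(__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
#define NMI_MASK		(__IRQ_MASK(NMI_BITS) << NMI_SHIFT)
#define HARDIRQ_OFFSET		(1UL << HARDIRQ_SHIFT)

static unsigned long preempt_count;	/* stands in for per-CPU __preempt_count */
static unsigned int nmi_nesting;	/* stands in for per-CPU nmi_nesting */

static void nmi_enter(void)
{
	unsigned long o = NMI_MASK + HARDIRQ_OFFSET;

	nmi_nesting++;
	/* Only the outermost NMI actually sets NMI_MASK. */
	o -= preempt_count & NMI_MASK;
	preempt_count += o;
}

static void nmi_exit(void)
{
	unsigned long o = HARDIRQ_OFFSET;

	/* Clear NMI_MASK only when the last nested NMI leaves. */
	if (!--nmi_nesting)
		o += NMI_MASK;
	preempt_count -= o;
}

int main(void)
{
	nmi_enter();				/* outermost NMI */
	nmi_enter();				/* nested NMI */
	assert((preempt_count & NMI_MASK) == NMI_MASK);
	assert((preempt_count & HARDIRQ_MASK) == 2 * HARDIRQ_OFFSET);

	nmi_exit();
	assert(preempt_count & NMI_MASK);	/* still inside the outer NMI */

	nmi_exit();
	assert(preempt_count == 0);
	printf("NMI flag bit plus separate nesting counter behave as expected\n");
	return 0;
}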