Message-ID: <ZsMGRIec1y8hdKRG@J2N7QTR9R3.cambridge.arm.com>
Date: Mon, 19 Aug 2024 09:45:56 +0100
From: Mark Rutland <mark.rutland@....com>
To: cl@...two.org
Cc: Catalin Marinas <catalin.marinas@....com>,
Will Deacon <will@...nel.org>,
Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...hat.com>, Waiman Long <longman@...hat.com>,
Boqun Feng <boqun.feng@...il.com>,
Linus Torvalds <torvalds@...ux-foundation.org>, linux-mm@...ck.org,
linux-kernel@...r.kernel.org, linux-arm-kernel@...ts.infradead.org
Subject: Re: [PATCH RFC] Avoid memory barrier in read_seqcount() through load
acquire
On Tue, Aug 13, 2024 at 11:26:17AM -0700, Christoph Lameter via B4 Relay wrote:
> From: "Christoph Lameter (Ampere)" <cl@...two.org>
>
> Some architectures support load-acquire, which can save us a memory
> barrier and some cycles.
>
> A typical sequence
>
> 	do {
> 		seq = read_seqcount_begin(&s);
> 		<something>
> 	} while (read_seqcount_retry(&s, seq));
>
> requires 13 cycles on ARM64 for an empty loop. Two read memory barriers are
> needed, one for each of the seqcount_*() functions.
>
> We can replace the first read barrier with a load acquire of
> the seqcount which saves us one barrier.
>
> On ARM64 doing so reduces the cycle count from 13 to 8.
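
To make the read-side difference concrete, the change boils down to roughly
the following (a simplified sketch, not the exact seqlock.h code):

	/* before: plain load followed by a read barrier */
	seq = READ_ONCE(s->sequence);
	smp_rmb();

	/* after: the barrier is folded into the load itself */
	seq = smp_load_acquire(&s->sequence);

On arm64 the former is typically a plain LDR followed by a DMB ISHLD, while
the latter can be a single LDAR (or LDAPR where the RCpc extension is
available), which is where the cycle saving comes from.
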
The patch itself looks reasonable to me, but in the commit message could
you please replace "ARM64" with the specific micro-architecture you're
testing on? There are tens of contemporary micro-architectures, and for
future reference it'd be good to know what specifically you've tested
on.
If you cannot disclose that for some reason, just say "on my ARM64 test
machine" or something like that, so that we're not implying that this is
true for all ARM64 implementations.
Mark.
>
> Signed-off-by: Christoph Lameter (Ampere) <cl@...two.org>
> ---
> arch/Kconfig | 5 +++++
> arch/arm64/Kconfig | 1 +
> include/linux/seqlock.h | 41 +++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 47 insertions(+)
>
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 975dd22a2dbd..3f8867110a57 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -1600,6 +1600,11 @@ config ARCH_HAS_KERNEL_FPU_SUPPORT
> Architectures that select this option can run floating-point code in
> the kernel, as described in Documentation/core-api/floating-point.rst.
>
> +config ARCH_HAS_ACQUIRE_RELEASE
> + bool
> + help
> + Architectures that support acquire / release can avoid memory fences
> +
> source "kernel/gcov/Kconfig"
>
> source "scripts/gcc-plugins/Kconfig"
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index a2f8ff354ca6..19e34fff145f 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -39,6 +39,7 @@ config ARM64
> select ARCH_HAS_PTE_DEVMAP
> select ARCH_HAS_PTE_SPECIAL
> select ARCH_HAS_HW_PTE_YOUNG
> + select ARCH_HAS_ACQUIRE_RELEASE
> select ARCH_HAS_SETUP_DMA_OPS
> select ARCH_HAS_SET_DIRECT_MAP
> select ARCH_HAS_SET_MEMORY
> diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
> index d90d8ee29d81..353fcf32b800 100644
> --- a/include/linux/seqlock.h
> +++ b/include/linux/seqlock.h
> @@ -176,6 +176,28 @@ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \
> return seq; \
> } \
> \
> +static __always_inline unsigned \
> +__seqprop_##lockname##_sequence_acquire(const seqcount_##lockname##_t *s) \
> +{ \
> + unsigned seq = smp_load_acquire(&s->seqcount.sequence); \
> + \
> + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \
> + return seq; \
> + \
> + if (preemptible && unlikely(seq & 1)) { \
> + __SEQ_LOCK(lockbase##_lock(s->lock)); \
> + __SEQ_LOCK(lockbase##_unlock(s->lock)); \
> + \
> + /* \
> + * Re-read the sequence counter since the (possibly \
> + * preempted) writer made progress. \
> + */ \
> + seq = smp_load_acquire(&s->seqcount.sequence); \
> + } \
> + \
> + return seq; \
> +} \
> + \
> static __always_inline bool \
> __seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s) \
> { \
> @@ -211,6 +233,11 @@ static inline unsigned __seqprop_sequence(const seqcount_t *s)
> return READ_ONCE(s->sequence);
> }
>
> +static inline unsigned __seqprop_sequence_acquire(const seqcount_t *s)
> +{
> + return smp_load_acquire(&s->sequence);
> +}
> +
> static inline bool __seqprop_preemptible(const seqcount_t *s)
> {
> return false;
> @@ -259,6 +286,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
> #define seqprop_ptr(s) __seqprop(s, ptr)(s)
> #define seqprop_const_ptr(s) __seqprop(s, const_ptr)(s)
> #define seqprop_sequence(s) __seqprop(s, sequence)(s)
> +#define seqprop_sequence_acquire(s) __seqprop(s, sequence_acquire)(s)
> #define seqprop_preemptible(s) __seqprop(s, preemptible)(s)
> #define seqprop_assert(s) __seqprop(s, assert)(s)
>
> @@ -293,6 +321,18 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
> *
> * Return: count to be passed to read_seqcount_retry()
> */
> +#ifdef CONFIG_ARCH_HAS_ACQUIRE_RELEASE
> +#define raw_read_seqcount_begin(s) \
> +({ \
> + unsigned _seq; \
> + \
> + while ((_seq = seqprop_sequence_acquire(s)) & 1) \
> + cpu_relax(); \
> + \
> + kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \
> + _seq; \
> +})
> +#else
> #define raw_read_seqcount_begin(s) \
> ({ \
> unsigned _seq = __read_seqcount_begin(s); \
> @@ -300,6 +340,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
> smp_rmb(); \
> _seq; \
> })
> +#endif
>
> /**
> * read_seqcount_begin() - begin a seqcount_t read critical section
>
> ---
> base-commit: 6b4aa469f04999c3f244515fa7491b4d093c5167
> change-id: 20240813-seq_optimize-68c48696c798
>
> Best regards,
> --
> Christoph Lameter <cl@...two.org>
>
>
>
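
For anyone who wants to play with the pattern outside the kernel, a rough
user-space analogue using C11 atomics could look something like the sketch
below (illustrative only, following the usual seqlock-with-fences recipe
rather than the kernel implementation; the names are made up):

	#include <stdatomic.h>

	struct sample {
		atomic_uint seq;	/* even: stable, odd: write in progress */
		atomic_int a, b;	/* data protected by the sequence count */
	};

	/* Single writer assumed (or writers serialized externally). */
	static void sample_write(struct sample *s, int a, int b)
	{
		unsigned start = atomic_load_explicit(&s->seq, memory_order_relaxed);

		/* Make the count odd before touching the data. */
		atomic_store_explicit(&s->seq, start + 1, memory_order_relaxed);
		atomic_thread_fence(memory_order_release);

		atomic_store_explicit(&s->a, a, memory_order_relaxed);
		atomic_store_explicit(&s->b, b, memory_order_relaxed);

		/* Publish: make the count even again with release ordering. */
		atomic_store_explicit(&s->seq, start + 2, memory_order_release);
	}

	static void sample_read(struct sample *s, int *a, int *b)
	{
		unsigned seq1, seq2;

		do {
			/* The acquire load stands in for the separate read barrier. */
			seq1 = atomic_load_explicit(&s->seq, memory_order_acquire);
			*a = atomic_load_explicit(&s->a, memory_order_relaxed);
			*b = atomic_load_explicit(&s->b, memory_order_relaxed);
			atomic_thread_fence(memory_order_acquire);
			seq2 = atomic_load_explicit(&s->seq, memory_order_relaxed);
		} while ((seq1 & 1) || seq1 != seq2);
	}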