Message-ID: <6vss7walhjfjmgau5sytf5b3lyjadmfi4seh6amxlthl3sig3b@dpbuhz6ds26y>
Date: Wed, 3 Dec 2025 15:36:08 +0100
From: Mateusz Guzik <mjguzik@...il.com>
To: Gabriel Krisman Bertazi <krisman@...e.de>
Cc: Jan Kara <jack@...e.cz>,
Mathieu Desnoyers <mathieu.desnoyers@...icios.com>, linux-mm@...ck.org, linux-kernel@...r.kernel.org,
Shakeel Butt <shakeel.butt@...ux.dev>, Michal Hocko <mhocko@...nel.org>,
Dennis Zhou <dennis@...nel.org>, Tejun Heo <tj@...nel.org>, Christoph Lameter <cl@...two.org>,
Andrew Morton <akpm@...ux-foundation.org>, David Hildenbrand <david@...hat.com>,
Lorenzo Stoakes <lorenzo.stoakes@...cle.com>, "Liam R. Howlett" <Liam.Howlett@...cle.com>,
Vlastimil Babka <vbabka@...e.cz>, Mike Rapoport <rppt@...nel.org>,
Suren Baghdasaryan <surenb@...gle.com>, Thomas Gleixner <tglx@...utronix.de>
Subject: Re: [RFC PATCH 0/4] Optimize rss_stat initialization/teardown for
 single-threaded tasks

On Wed, Dec 03, 2025 at 12:54:34PM +0100, Mateusz Guzik wrote:
> So I got another idea and it boils down to coalescing cid init with
> rss checks on exit.
>

So the short version is: I implemented a POC and I get the same
performance for single-threaded processes as your patchset when testing
on Sapphire Rapids in an 80-way VM.
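
To make the shape of it concrete before the caveats, here is a condensed
sketch of the exit-time walk (the hotplug guard and the error reporting
from the real diff at the end of this mail are omitted, and the function
name is made up):

/*
 * Sketch only -- the actual hack is in the diff at the end of this mail.
 *
 * Per-CPU cid state is no longer zeroed when an mm is created.  Instead,
 * check_mm() at exit time already has to visit every CPU to validate the
 * rss counters, so the same walk re-initializes the cid state for the
 * next user of the cached mm_struct.
 */
static void check_mm_unified_walk(struct mm_struct *mm)
{
	long rss_stat[NR_MM_COUNTERS];
	int i, cpu;

	for (i = 0; i < NR_MM_COUNTERS; i++)
		rss_stat[i] = mm->rss_stat[i].count;

	for_each_possible_cpu(cpu) {
		struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);

		/* reset cid state for the next mm_struct user */
		pcpu_cid->cid = MM_CID_UNSET;
		pcpu_cid->recent_cid = MM_CID_UNSET;
		pcpu_cid->time = 0;

		/* ... and fold in the per-CPU rss deltas in the same pass */
		for (i = 0; i < NR_MM_COUNTERS; i++)
			rss_stat[i] += *per_cpu_ptr(mm->rss_stat[i].counters, cpu);
	}

	for (i = 0; i < NR_MM_COUNTERS; i++)
		VM_WARN_ON_ONCE(rss_stat[i]);
}
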
Caveats:
- there is a performance bug on the CPU concerning rep movsb (see
  https://lore.kernel.org/all/mwwusvl7jllmck64xczeka42lglmsh7mlthuvmmqlmi5stp3na@raiwozh466wz/),
  which I worked around like so (a userspace snippet for eyeballing the
  effect of the added switches follows after the caveats):
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index e20e25b8b16c..1b538f7bbd89 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -189,6 +189,29 @@ ifeq ($(CONFIG_STACKPROTECTOR),y)
   endif
 endif
 
+ifdef CONFIG_CC_IS_GCC
+#
+# Inline memcpy and memset handling policy for gcc.
+#
+# For ops of sizes known at compilation time it quickly resorts to issuing rep
+# movsq and stosq. On most uarchs rep-prefixed ops have a significant startup
+# latency and it is faster to issue regular stores (even if in loops) to handle
+# small buffers.
+#
+# This of course comes at an expense in terms of i-cache footprint. bloat-o-meter
+# reported a 0.23% increase for enabling these.
+#
+# We inline up to 256 bytes, which in the best case issues a few movs, in the
+# worst case creates a 4 * 8 store loop.
+#
+# The upper limit was chosen semi-arbitrarily -- uarchs wildly differ in the
+# threshold past which a rep-prefixed op becomes faster, 256 being the lowest
+# common denominator. Someone(tm) should revisit this from time to time.
+#
+KBUILD_CFLAGS += -mmemcpy-strategy=unrolled_loop:256:noalign,libcall:-1:noalign
+KBUILD_CFLAGS += -mmemset-strategy=unrolled_loop:256:noalign,libcall:-1:noalign
+endif
+
 #
 # If the function graph tracer is used with mcount instead of fentry,
 # '-maccumulate-outgoing-args' is needed to prevent a GCC bug

- the qemu version I'm saddled with does not pass FSRS to the guest, thus:
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index fb5a03cf5ab7..a692bb4cece4 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -30,7 +30,7 @@
  * which the compiler could/should do much better anyway.
  */
 SYM_TYPED_FUNC_START(__memset)
-	ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS
+//	ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS
 
 	movq %rdi,%r9
 	movb %sil,%al
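
As mentioned in the first caveat, here is a quick userspace way to
eyeball what the -mmemcpy-strategy/-mmemset-strategy switches do.  This
is purely illustrative (the file name and sizes are arbitrary) -- compile
it with and without the switches and compare the generated asm; note
that outside of a -mno-sse build gcc may prefer vector stores over
rep-prefixed ops to begin with:

/*
 * strategy-check.c -- not part of any patch, just an illustration.
 *
 *   gcc -O2 -mno-sse -S strategy-check.c
 *   gcc -O2 -mno-sse -S strategy-check.c \
 *       -mmemcpy-strategy=unrolled_loop:256:noalign,libcall:-1:noalign \
 *       -mmemset-strategy=unrolled_loop:256:noalign,libcall:-1:noalign
 *
 * Without the switches gcc is free to emit rep stosq/movsq for these
 * fixed-size ops; with them it emits plain stores (possibly in a small
 * loop) up to 256 bytes and calls memset/memcpy past that.
 */
#include <string.h>

struct obj {
	char buf[192];	/* below the 256 byte cutoff */
};

void obj_clear(struct obj *o)
{
	memset(o, 0, sizeof(*o));
}

void obj_copy(struct obj *dst, const struct obj *src)
{
	memcpy(dst, src, sizeof(*src));
}
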
The baseline commit (+ the 2 hacks above) is the following:
commit a8ec08bf32595ea4b109e3c7f679d4457d1c58c0
Merge: ed80cc758b78 48233291461b
Author: Vlastimil Babka <vbabka@...e.cz>
Date:   Tue Nov 25 14:38:41 2025 +0100

    Merge branch 'slab/for-6.19/mempool_alloc_bulk' into slab/for-next

This is what the ctor/dtor branch is rebased on. It is missing some of
the further changes to the cid machinery in upstream, but they don't
fundamentally interfere with the core idea of the patch (pcpu memory is
still allocated on mm creation and it is still being zeroed), so I did
not bother rebasing -- the end perf will be the same.

The benchmark is a static binary executing itself in a loop: http://apollo.backplane.com/DFlyMisc/doexec.c
$ cc -O2 -o static-doexec doexec.c
$ taskset --cpu-list 1 ./static-doexec 1

With ctor+dtor+unified walk I'm seeing a 2% improvement over the
baseline and the same performance as the lazy counter approach.
If nobody is willing to productize this, I'm going to do it.

Non-production hack below for reference:
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index cb9c6b16c311..f952ec1f59d1 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1439,7 +1439,7 @@ static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
 	return (struct cpumask *)cid_bitmap;
 }
 
-static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
+static inline void mm_init_cid_percpu(struct mm_struct *mm, struct task_struct *p)
 {
 	int i;
 
@@ -1457,6 +1457,15 @@ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
 	cpumask_clear(mm_cidmask(mm));
 }
 
+static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
+{
+	mm->nr_cpus_allowed = p->nr_cpus_allowed;
+	atomic_set(&mm->max_nr_cid, 0);
+	raw_spin_lock_init(&mm->cpus_allowed_lock);
+	cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
+	cpumask_clear(mm_cidmask(mm));
+}
+
 static inline int mm_alloc_cid_noprof(struct mm_struct *mm)
 {
 	mm->pcpu_cid = alloc_percpu_noprof(struct mm_cid);
diff --git a/kernel/fork.c b/kernel/fork.c
index a26319cddc3c..1575db9f0198 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -575,21 +575,46 @@ static inline int mm_alloc_id(struct mm_struct *mm) { return 0; }
 static inline void mm_free_id(struct mm_struct *mm) {}
 #endif /* CONFIG_MM_ID */
 
+/*
+ * pretend this is fully integrated into hotplug support
+ */
+__cacheline_aligned_in_smp DEFINE_SEQLOCK(cpu_hotplug_lock);
+
 static void check_mm(struct mm_struct *mm)
 {
-	int i;
+	long rss_stat[NR_MM_COUNTERS];
+	unsigned cpu_seq;
+	int i, cpu;
 
 	BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
 			 "Please make sure 'struct resident_page_types[]' is updated as well");
 
-	for (i = 0; i < NR_MM_COUNTERS; i++) {
-		long x = percpu_counter_sum(&mm->rss_stat[i]);
+	cpu_seq = read_seqbegin(&cpu_hotplug_lock);
+	local_irq_disable();
+	for (i = 0; i < NR_MM_COUNTERS; i++)
+		rss_stat[i] = mm->rss_stat[i].count;
+
+	for_each_possible_cpu(cpu) {
+		struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
+
+		pcpu_cid->cid = MM_CID_UNSET;
+		pcpu_cid->recent_cid = MM_CID_UNSET;
+		pcpu_cid->time = 0;
 
-		if (unlikely(x)) {
+		for (i = 0; i < NR_MM_COUNTERS; i++)
+			rss_stat[i] += *per_cpu_ptr(mm->rss_stat[i].counters, cpu);
+	}
+	local_irq_enable();
+	if (read_seqretry(&cpu_hotplug_lock, cpu_seq))
+		BUG();
+
+	for (i = 0; i < NR_MM_COUNTERS; i++) {
+		if (unlikely(rss_stat[i])) {
 			pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n",
-				 mm, resident_page_types[i], x,
+				 mm, resident_page_types[i], rss_stat[i],
 				 current->comm,
 				 task_pid_nr(current));
+			/* XXXBUG: ZERO IT OUT */
 		}
 	}
 
@@ -2953,10 +2978,19 @@ static int sighand_ctor(void *data)
 static int mm_struct_ctor(void *object)
 {
 	struct mm_struct *mm = object;
+	int cpu;
 
 	if (mm_alloc_cid(mm))
 		return -ENOMEM;
 
+	for_each_possible_cpu(cpu) {
+		struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
+
+		pcpu_cid->cid = MM_CID_UNSET;
+		pcpu_cid->recent_cid = MM_CID_UNSET;
+		pcpu_cid->time = 0;
+	}
+
 	if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL,
 				     NR_MM_COUNTERS)) {
 		mm_destroy_cid(mm);
diff --git a/mm/percpu.c b/mm/percpu.c
index 7d036f42b5af..47e23ea90d7b 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1693,7 +1693,7 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
 	obj_cgroup_put(objcg);
 }
 
-bool pcpu_charge(void *ptr, size_t size, gfp_t gfp)
+bool pcpu_charge(void __percpu *ptr, size_t size, gfp_t gfp)
 {
 	struct obj_cgroup *objcg = NULL;
 	void *addr;
@@ -1710,7 +1710,7 @@ bool pcpu_charge(void *ptr, size_t size, gfp_t gfp)
 	return true;
 }
 
-void pcpu_uncharge(void *ptr, size_t size)
+void pcpu_uncharge(void __percpu *ptr, size_t size)
 {
 	void *addr;
 	struct pcpu_chunk *chunk;
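
For completeness: the "pretend this is fully integrated into hotplug
support" bit above would need a write side in the hotplug path, and a
productized version would presumably retry the walk instead of BUG().
Roughly like the following sketch (not part of the patch; the hook names
are made up and the cid reinit is left out):

/* write side, wherever cpus get brought up/down */
static void mm_counters_hotplug_begin(void)
{
	write_seqlock(&cpu_hotplug_lock);
}

static void mm_counters_hotplug_end(void)
{
	write_sequnlock(&cpu_hotplug_lock);
}

/* check_mm() read side: redo the walk on a racing hotplug event */
static void check_mm_sum_rss(struct mm_struct *mm, long *rss_stat)
{
	unsigned int seq;
	int i, cpu;

	do {
		seq = read_seqbegin(&cpu_hotplug_lock);
		local_irq_disable();
		for (i = 0; i < NR_MM_COUNTERS; i++)
			rss_stat[i] = mm->rss_stat[i].count;
		for_each_possible_cpu(cpu) {
			for (i = 0; i < NR_MM_COUNTERS; i++)
				rss_stat[i] += *per_cpu_ptr(mm->rss_stat[i].counters, cpu);
		}
		local_irq_enable();
	} while (read_seqretry(&cpu_hotplug_lock, seq));
}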