Message-ID: <20250424080755.272925-8-harry.yoo@oracle.com>
Date: Thu, 24 Apr 2025 17:07:55 +0900
From: Harry Yoo <harry.yoo@...cle.com>
To: Vlastimil Babka <vbabka@...e.cz>, Christoph Lameter <cl@...two.org>,
David Rientjes <rientjes@...gle.com>,
Andrew Morton <akpm@...ux-foundation.org>,
Dennis Zhou <dennis@...nel.org>, Tejun Heo <tj@...nel.org>,
Mateusz Guzik <mjguzik@...il.com>
Cc: Jamal Hadi Salim <jhs@...atatu.com>, Cong Wang <xiyou.wangcong@...il.com>,
Jiri Pirko <jiri@...nulli.us>, Vlad Buslov <vladbu@...dia.com>,
Yevgeny Kliteynik <kliteyn@...dia.com>, Jan Kara <jack@...e.cz>,
Byungchul Park <byungchul@...com>, linux-mm@...ck.org,
netdev@...r.kernel.org, linux-kernel@...r.kernel.org,
Harry Yoo <harry.yoo@...cle.com>
Subject: [RFC PATCH 7/7] kernel/fork: improve exec() throughput with slab ctor/dtor pair
When initializing a newly allocated mm_struct, mm_init() allocates two
chunks of percpu memory (pcpu_cid and rss_stat). Because the percpu
memory allocator uses a global mutex (pcpu_alloc_mutex), it becomes a
global serialization point for exec().
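
The bottleneck lives in mm/percpu.c: simplified, every sleepable percpu
allocation funnels through the same system-wide mutex (a rough sketch,
not the verbatim code):

	static void __percpu *pcpu_alloc(size_t size, size_t align,
					 bool reserved, gfp_t gfp)
	{
		void __percpu *ptr;

		/* one mutex shared by every percpu user in the system */
		mutex_lock(&pcpu_alloc_mutex);
		ptr = ...; /* find/extend a chunk, set up the mapping */
		mutex_unlock(&pcpu_alloc_mutex);
		return ptr;
	}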
Use a slab ctor/dtor pair to allocate and free the percpu memory
(pcpu_cid, rss_stat) for mm_struct, so the allocations are amortized
across mm_struct reuse instead of being performed on every exec().
mm_init_cid() is moved to mm_init(), and the rss_stat percpu counters
are charged in mm_init() and uncharged in __mmdrop().
As the rss_stat and pcpu_cid fields must not be overwritten by the
memset() in mm_alloc() and the memcpy() in dup_mm(), move those fields
to the end of mm_struct. Any field defined at or after
'ctor_fields_offset' won't be overwritten by these helpers.
cpu_bitmap[], on the other hand, is not initialized by the constructor,
yet it must remain the last field of mm_struct because it is dynamically
sized based on nr_cpu_ids. However, as cpu_bitmap[] is always
initialized by mm_init(), not covering this field with the memset() and
memcpy() does not change the current behavior.
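
In other words, the reinitialization helpers stop at the union's offset
and leave everything from rss_stat onward untouched (taken from the
diff below):

	memset(mm, 0, offsetof(struct mm_struct, ctor_fields_offset));
	memcpy(mm, oldmm, offsetof(struct mm_struct, ctor_fields_offset));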
Note that check_mm() validates that every rss counter is zero and
reports an error when a nonzero value is found. In other words, under
normal conditions the counters are always zero by the time an mm_struct
is freed. Therefore it is not necessary to reset the counters in
mm_init() once they have been initialized by the constructor.
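
For reference, the existing check_mm() in kernel/fork.c does roughly
the following for the rss counters (simplified, unchanged by this
patch):

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		long x = percpu_counter_sum(&mm->rss_stat[i]);

		if (unlikely(x))
			pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
				 mm, resident_page_types[i], x);
	}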
To measure the performance impact, I ran the exec() benchmark [1], which
launches one process per CPU, each process repeatedly invoking exec().
On a desktop with 12 cores (24 hardware threads), it raises
exec() throughput by an average of 4.56%. Even in a single-threaded run
I see roughly a 4% gain, showing that the cost of acquiring and releasing
pcpu_alloc_mutex is significant even when it is uncontended.
On a dual-socket server with 192 cores the mutex becomes a contention
hotspot; mitigating that contention boosts exec() throughput by 33%.
Link: http://apollo.backplane.com/DFlyMisc/doexec.c [1]
Link: https://lore.kernel.org/linux-mm/CAGudoHFc+Km-3usiy4Wdm1JkM+YjCgD9A8dDKQ06pZP070f1ig@mail.gmail.com
Suggested-by: Mateusz Guzik <mjguzik@...il.com>
Signed-off-by: Harry Yoo <harry.yoo@...cle.com>
---
include/linux/mm_types.h | 40 +++++++++++++++++---------
kernel/fork.c | 62 +++++++++++++++++++++++++++-------------
2 files changed, 69 insertions(+), 33 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 56d07edd01f9..3000ca47b8ba 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -946,14 +946,6 @@ struct mm_struct {
atomic_t mm_users;
#ifdef CONFIG_SCHED_MM_CID
- /**
- * @pcpu_cid: Per-cpu current cid.
- *
- * Keep track of the currently allocated mm_cid for each cpu.
- * The per-cpu mm_cid values are serialized by their respective
- * runqueue locks.
- */
- struct mm_cid __percpu *pcpu_cid;
/*
* @mm_cid_next_scan: Next mm_cid scan (in jiffies).
*
@@ -982,6 +974,7 @@ struct mm_struct {
* mm nr_cpus_allowed updates.
*/
raw_spinlock_t cpus_allowed_lock;
+ unsigned long _padding; /* for optimal offset of mmap_lock */
#endif
#ifdef CONFIG_MMU
atomic_long_t pgtables_bytes; /* size of all page tables */
@@ -1059,8 +1052,6 @@ struct mm_struct {
unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
- struct percpu_counter rss_stat[NR_MM_COUNTERS];
-
struct linux_binfmt *binfmt;
/* Architecture-specific MM context */
@@ -1169,6 +1160,30 @@ struct mm_struct {
#endif /* CONFIG_MM_ID */
} __randomize_layout;
+ /*
+ * The fields below are not initialized by memset() or copied
+ * by memcpy(), in order to avoid overwriting values that are
+ * initialized by the slab constructor.
+ *
+ * The last field, cpu_bitmap, is an exception. This field is not
+ * initialized by the constructor and is always initialized by
+ * the mm_init() function.
+ */
+ union {
+ unsigned long ctor_fields_offset;
+ struct percpu_counter rss_stat[NR_MM_COUNTERS];
+ };
+#ifdef CONFIG_SCHED_MM_CID
+ /**
+ * @pcpu_cid: Per-cpu current cid.
+ *
+ * Keep track of the currently allocated mm_cid for each cpu.
+ * The per-cpu mm_cid values are serialized by their respective
+ * runqueue locks.
+ */
+ struct mm_cid __percpu *pcpu_cid;
+#endif
+
/*
* The mm_cpumask needs to be at the end of mm_struct, because it
* is dynamically sized based on nr_cpu_ids.
@@ -1348,12 +1363,11 @@ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
cpumask_clear(mm_cidmask(mm));
}
-static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p)
+static inline int mm_alloc_cid_noprof(struct mm_struct *mm)
{
mm->pcpu_cid = alloc_percpu_noprof(struct mm_cid);
if (!mm->pcpu_cid)
return -ENOMEM;
- mm_init_cid(mm, p);
return 0;
}
#define mm_alloc_cid(...) alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__))
@@ -1383,7 +1397,7 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas
}
#else /* CONFIG_SCHED_MM_CID */
static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { }
-static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; }
+static inline int mm_alloc_cid(struct mm_struct *mm) { return 0; }
static inline void mm_destroy_cid(struct mm_struct *mm) { }
static inline unsigned int mm_cid_size(void)
diff --git a/kernel/fork.c b/kernel/fork.c
index 7966b0876dc3..5940cf37379c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -943,8 +943,7 @@ void __mmdrop(struct mm_struct *mm)
check_mm(mm);
put_user_ns(mm->user_ns);
mm_pasid_drop(mm);
- mm_destroy_cid(mm);
- percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
+ percpu_counter_uncharge_many(mm->rss_stat, NR_MM_COUNTERS);
free_mm(mm);
}
@@ -1295,7 +1294,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm->map_count = 0;
mm->locked_vm = 0;
atomic64_set(&mm->pinned_vm, 0);
- memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
spin_lock_init(&mm->arg_lock);
mm_init_cpumask(mm);
@@ -1328,21 +1326,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
if (init_new_context(p, mm))
goto fail_nocontext;
- if (mm_alloc_cid(mm, p))
- goto fail_cid;
-
- if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
- NR_MM_COUNTERS))
- goto fail_pcpu;
+ if (!percpu_counter_charge_many(mm->rss_stat, GFP_KERNEL_ACCOUNT,
+ NR_MM_COUNTERS))
+ goto failed_charge;
+ mm_init_cid(mm, p);
mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
return mm;
-fail_pcpu:
+failed_charge:
mm_destroy_cid(mm);
-fail_cid:
- destroy_context(mm);
fail_nocontext:
mm_free_id(mm);
fail_noid:
@@ -1363,7 +1357,7 @@ struct mm_struct *mm_alloc(void)
if (!mm)
return NULL;
- memset(mm, 0, sizeof(*mm));
+ memset(mm, 0, offsetof(struct mm_struct, ctor_fields_offset));
return mm_init(mm, current, current_user_ns());
}
EXPORT_SYMBOL_IF_KUNIT(mm_alloc);
@@ -1725,7 +1719,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk,
if (!mm)
goto fail_nomem;
- memcpy(mm, oldmm, sizeof(*mm));
+ memcpy(mm, oldmm, offsetof(struct mm_struct, ctor_fields_offset));
if (!mm_init(mm, tsk, mm->user_ns))
goto fail_nomem;
@@ -3193,9 +3187,40 @@ static int sighand_ctor(void *data)
return 0;
}
+static int mm_struct_ctor(void *object)
+{
+ struct mm_struct *mm = object;
+
+ if (mm_alloc_cid(mm))
+ return -ENOMEM;
+
+ if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL,
+ NR_MM_COUNTERS)) {
+ mm_destroy_cid(mm);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void mm_struct_dtor(void *object)
+{
+ struct mm_struct *mm = object;
+
+ mm_destroy_cid(mm);
+ percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
+}
+
void __init mm_cache_init(void)
{
unsigned int mm_size;
+ struct kmem_cache_args kmem_args = {
+ .align = ARCH_MIN_MMSTRUCT_ALIGN,
+ .useroffset = offsetof(struct mm_struct, saved_auxv),
+ .usersize = sizeof_field(struct mm_struct, saved_auxv),
+ .ctor = mm_struct_ctor,
+ .dtor = mm_struct_dtor,
+ };
/*
* The mm_cpumask is located at the end of mm_struct, and is
@@ -3204,12 +3229,9 @@ void __init mm_cache_init(void)
*/
mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size();
- mm_cachep = kmem_cache_create_usercopy("mm_struct",
- mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
- offsetof(struct mm_struct, saved_auxv),
- sizeof_field(struct mm_struct, saved_auxv),
- NULL);
+ mm_cachep = kmem_cache_create("mm_struct", mm_size, &kmem_args,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|
+ SLAB_ACCOUNT);
}
void __init proc_caches_init(void)
--
2.43.0