Message-ID: <ek7v5vxht56g77kbojo35eqdcqdmxjvguf3tp5wnfydopbthb2@xie6b7r53ote>
Date: Tue, 9 Dec 2025 10:26:22 +0100
From: Mateusz Guzik <mjguzik@...il.com>
To: Vlastimil Babka <vbabka@...e.cz>
Cc: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>,
David Laight <david.laight.linux@...il.com>, Andrew Morton <akpm@...ux-foundation.org>,
David Hildenbrand <david@...nel.org>, "Liam R . Howlett" <Liam.Howlett@...cle.com>,
Mike Rapoport <rppt@...nel.org>, Suren Baghdasaryan <surenb@...gle.com>,
Michal Hocko <mhocko@...e.com>, oliver.sang@...el.com, linux-mm@...ck.org,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH] mm: avoid use of BIT() macro for initialising VMA flags

On Tue, Dec 09, 2025 at 09:28:10AM +0100, Vlastimil Babka wrote:
> As Mateusz pointed out off-list, the profiles look like mutexes are doing
> less optimistic spinning and more sleeping. Which IMHO isn't something that
> this change can directly affect.
>

Not mutexes but rwsems.

The bench at hand has some of the code under spinlocks, while other
parts take rwsems for reading *or* writing.

I had a peek at the rwsem implementation and to my understanding it can
degrade to no spinning at all in a microbenchmark setting like this one,
provided you are unlucky enough. In particular you can get unlucky if
existing timings get perturbed, which I presume is what is happening
after Lorenzo's patch.
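
To illustrate, here is my toy model of the spinning heuristic (my own
paraphrase of the idea, *not* the actual kernel/locking/rwsem.c code):

#include <stdbool.h>

enum owner_state { OWNER_NULL, OWNER_WRITER, OWNER_READER };

/*
 * A spinner can only watch a single owning writer: as long as that
 * writer stays on cpu, release is imminent and spinning pays off.
 * A reader-owned lock has no single owner to watch, so spinning is
 * time-capped at best (modeled here as not spinning at all). Once
 * somebody gives up and queues as a waiter, later lockers see the
 * waiter and go to sleep as well, and the whole thing collapses into
 * sleep/wakeup cycles.
 */
static bool worth_spinning(enum owner_state owner, bool owner_on_cpu,
			   bool have_waiters)
{
	if (have_waiters)
		return false;		/* don't jump the queue, sleep */

	switch (owner) {
	case OWNER_WRITER:
		return owner_on_cpu;	/* watch the owner run */
	case OWNER_READER:
		return false;		/* nobody to watch */
	default:
		return true;		/* lock is free, retry trylock */
	}
}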

As a demonstration, I wrote a toy patch which conditionally converts
the affected down_read calls into down_write (inlined at the end).

While the original report is based on a 192-thread box, I was only able
to test with 80 threads. Even so, the crux of the issue reproduced
nicely:

./stress-ng --timeout 10 --times --verify --metrics --no-rand-seed --msg 80

Top says (times vary, idle is growing over time):
%Cpu(s):  3.3 us, 24.4 sy,  0.0 ni, 72.3 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st

... but if I flip the switch to down_write:
%Cpu(s):  6.3 us, 80.9 sy,  0.0 ni, 12.7 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st

The switch is a sysctl named fs.magic_tunable (0 == down_read,
1 == down_write).
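It shows up under fs. since the knob sits in fs_stat_sysctls (see the
patch at the end) and can be flipped at runtime, e.g.:

# sysctl fs.magic_tunable=1
fs.magic_tunable = 1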

In terms of performance I see the following:

stress-ng: metrc: [2546] stressor       bogo ops real time  usr time  sys time   bogo ops/s     bogo ops/s CPU used per       RSS Max
stress-ng: metrc: [2546]                           (secs)    (secs)    (secs)   (real time) (usr+sys time) instance (%)          (KB)

# sysctl fs.magic_tunable=0   ## down_read
stress-ng: metrc: [2546] msg            63353488     10.01     28.21    213.26    6331298.95      262362.91        30.16          2016

# sysctl fs.magic_tunable=1   ## down_write
stress-ng: metrc: [2036] msg           455014809     10.00     48.79    676.42   45496870.65      627425.68        90.64          2056
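
For scale: 45496870.65 / 6331298.95 ~= 7.2, so the down_write variant
pushes about 7 times the bogo ops/s while using about 3 times the CPU
(30.16% -> 90.64% per instance), matching the idle time seen in top.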

That is to say, the rwsem code is the real culprit and Lorenzo is a
random (albeit deserving) victim.

I see two action items:
- massage the patch back to a state where things compile to the same
  asm as before, as it clearly (and avoidably) regressed regardless of
  the aforementioned issue
- figure out what to do with the rwsem code for read vs write spinning

I'm not picking this up for the time being, but I might look into it at
some point.
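
Should someone want to poke at the read-side path without stress-ng:
msgctl(IPC_INFO) lands in msgctl_info() and takes the ipc rwsem for
reading (or writing, with the toy patch applied and the tunable set).
A minimal reproducer I would start from, untested and with an arbitrary
thread count and duration, would be something like:

#define _GNU_SOURCE		/* struct msginfo, IPC_INFO */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/ipc.h>
#include <sys/msg.h>

static atomic_ulong ops;
static atomic_int stop;

/* hammer msgctl(IPC_INFO), i.e. msgctl_info() in the kernel */
static void *worker(void *arg)
{
	struct msginfo info;

	(void)arg;
	while (!atomic_load_explicit(&stop, memory_order_relaxed)) {
		if (msgctl(0, IPC_INFO, (struct msqid_ds *)&info) < 0) {
			perror("msgctl");
			exit(1);
		}
		atomic_fetch_add_explicit(&ops, 1, memory_order_relaxed);
	}
	return NULL;
}

int main(int argc, char **argv)
{
	int nthreads = argc > 1 ? atoi(argv[1]) : 80;
	pthread_t tid[nthreads];

	for (int i = 0; i < nthreads; i++)
		pthread_create(&tid[i], NULL, worker, NULL);
	sleep(10);
	atomic_store(&stop, 1);
	for (int i = 0; i < nthreads; i++)
		pthread_join(tid[i], NULL);
	printf("%lu ops/s\n", atomic_load(&ops) / 10);
	return 0;
}

Build with cc -O2 -pthread and pass the thread count as the sole
argument.
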
diff --git a/fs/file_table.c b/fs/file_table.c
index cd4a3db4659a..de1ef700d144 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -109,6 +109,8 @@ static int proc_nr_files(const struct ctl_table *table, int write, void *buffer,
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
+unsigned long magic_tunable;
+
static const struct ctl_table fs_stat_sysctls[] = {
{
.procname = "file-nr",
@@ -126,6 +128,16 @@ static const struct ctl_table fs_stat_sysctls[] = {
.extra1 = SYSCTL_LONG_ZERO,
.extra2 = SYSCTL_LONG_MAX,
},
+ {
+ .procname = "magic_tunable",
+ .data = &magic_tunable,
+ .maxlen = sizeof(magic_tunable),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = SYSCTL_LONG_ZERO,
+ .extra2 = SYSCTL_LONG_MAX,
+ },
+
{
.procname = "nr_open",
.data = &sysctl_nr_open,
diff --git a/ipc/msg.c b/ipc/msg.c
index ee6af4fe52bf..fa835ea53e09 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -474,6 +474,8 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
return err;
}
+extern unsigned long magic_tunable;
+
static int msgctl_info(struct ipc_namespace *ns, int msqid,
int cmd, struct msginfo *msginfo)
{
@@ -495,11 +497,19 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid,
msginfo->msgmnb = ns->msg_ctlmnb;
msginfo->msgssz = MSGSSZ;
msginfo->msgseg = MSGSEG;
- down_read(&msg_ids(ns).rwsem);
- if (cmd == MSG_INFO)
- msginfo->msgpool = msg_ids(ns).in_use;
- max_idx = ipc_get_maxidx(&msg_ids(ns));
- up_read(&msg_ids(ns).rwsem);
+ if (!READ_ONCE(magic_tunable)) {
+ down_read(&msg_ids(ns).rwsem);
+ if (cmd == MSG_INFO)
+ msginfo->msgpool = msg_ids(ns).in_use;
+ max_idx = ipc_get_maxidx(&msg_ids(ns));
+ up_read(&msg_ids(ns).rwsem);
+ } else {
+ down_write(&msg_ids(ns).rwsem);
+ if (cmd == MSG_INFO)
+ msginfo->msgpool = msg_ids(ns).in_use;
+ max_idx = ipc_get_maxidx(&msg_ids(ns));
+ up_write(&msg_ids(ns).rwsem);
+ }
if (cmd == MSG_INFO) {
msginfo->msgmap = min_t(int,
percpu_counter_sum(&ns->percpu_msg_hdrs),
diff --git a/ipc/util.c b/ipc/util.c
index cae60f11d9c2..c65c8289a54b 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -771,6 +771,7 @@ struct ipc_proc_iter {
struct ipc_namespace *ns;
struct pid_namespace *pid_ns;
struct ipc_proc_iface *iface;
+ bool writelocked;
};
struct pid_namespace *ipc_seq_pid_ns(struct seq_file *s)
@@ -828,6 +829,8 @@ static void *sysvipc_proc_next(struct seq_file *s, void *it, loff_t *pos)
return sysvipc_find_ipc(&iter->ns->ids[iface->ids], pos);
}
+extern unsigned long magic_tunable;
+
/*
* File positions: pos 0 -> header, pos n -> ipc idx = n - 1.
* SeqFile iterator: iterator value locked ipc pointer or SEQ_TOKEN_START.
@@ -844,7 +847,13 @@ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos)
* Take the lock - this will be released by the corresponding
* call to stop().
*/
- down_read(&ids->rwsem);
+ if (!READ_ONCE(magic_tunable)) {
+ down_read(&ids->rwsem);
+ iter->writelocked = false;
+ } else {
+ down_write(&ids->rwsem);
+ iter->writelocked = true;
+ }
/* pos < 0 is invalid */
if (*pos < 0)
@@ -871,7 +880,10 @@ static void sysvipc_proc_stop(struct seq_file *s, void *it)
ids = &iter->ns->ids[iface->ids];
/* Release the lock we took in start() */
- up_read(&ids->rwsem);
+ if (!iter->writelocked)
+ up_read(&ids->rwsem);
+ else
+ up_write(&ids->rwsem);
}
static int sysvipc_proc_show(struct seq_file *s, void *it)