Message-ID: <20250604092815.UtG-oi0v@linutronix.de>
Date: Wed, 4 Jun 2025 11:28:15 +0200
From: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
To: Chris Mason <clm@...a.com>
Cc: Peter Zijlstra <peterz@...radead.org>, linux-kernel@...r.kernel.org
Subject: Re: futex performance regression from "futex: Allow automatic
allocation of process wide futex hash"
On 2025-06-03 15:00:43 [-0400], Chris Mason wrote:
> Hi everyone,
Hi,
> While testing Peter's latest scheduler patches against current Linus
> git, I found a pretty big performance regression with schbench:
>
> https://github.com/masoncl/schbench
>
> The command line I was using:
>
> schbench -L -m 4 -M auto -t 256 -n 0 -r 60 -s 0
>
…
> schbench uses one futex per thread, and the command line ends up
> allocating 1024 threads, so the default bucket size used by this commit
> is just too small. Using 2048 buckets makes the problem go away.
There is also the pthread_mutex_t in the mix, but yeah.
> On my big turin system, this commit slows down RPS by 36%. But even a
> VM on a skylake machine sees a 29% difference.
>
> schbench is a microbenchmark, so grain of salt on all of this, but I
> think our defaults are probably too low.
We could do:
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index abbd97c2fcba8..9046f3d9693e7 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -1687,7 +1687,7 @@ int futex_hash_allocate_default(void)
scoped_guard(rcu) {
threads = min_t(unsigned int,
get_nr_threads(current),
- num_online_cpus());
+ num_online_cpus() * 2);
fph = rcu_dereference(current->mm->futex_phash);
if (fph) {
That would increase it to 2048, as Chris asks for.
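For illustration, the sizing then works out like this; a standalone sketch of
the arithmetic only, not the kernel code itself (256 online CPUs and the 1024
threads from schbench's command line assumed, the clamp against
futex_hashmask + 1 left out):

#include <stdio.h>

/* Sketch of futex_hash_allocate_default()'s sizing with the hunk above
 * applied.
 */
static unsigned int roundup_pow_of_two(unsigned int n)
{
	unsigned int r = 1;

	while (r < n)
		r <<= 1;
	return r;
}

int main(void)
{
	unsigned int nr_threads = 1024;		/* -m 4 -t 256 */
	unsigned int online_cpus = 256;		/* assumed machine size */
	unsigned int threads, buckets;

	threads = nr_threads < online_cpus * 2 ? nr_threads : online_cpus * 2;
	buckets = roundup_pow_of_two(4 * threads);

	/* prints 2048; without the "* 2" above it would be 1024 */
	printf("buckets: %u\n", buckets);
	return 0;
}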
Then there is also the possibility of:
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index abbd97c2fcba8..a19a96cc09c9e 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -1680,6 +1680,8 @@ int futex_hash_allocate_default(void)
{
unsigned int threads, buckets, current_buckets = 0;
struct futex_private_hash *fph;
+ bool current_immutable = false;
+ unsigned int flags = 0;
if (!current->mm)
return 0;
@@ -1695,6 +1697,7 @@ int futex_hash_allocate_default(void)
return 0;
current_buckets = fph->hash_mask + 1;
+ current_immutable = fph->immutable;
}
}
@@ -1705,10 +1708,13 @@ int futex_hash_allocate_default(void)
buckets = roundup_pow_of_two(4 * threads);
buckets = clamp(buckets, 16, futex_hashmask + 1);
- if (current_buckets >= buckets)
- return 0;
+ if (current_buckets >= buckets) {
+ if (current_immutable)
+ return 0;
+ flags = FH_IMMUTABLE;
+ }
- return futex_hash_allocate(buckets, 0);
+ return futex_hash_allocate(buckets, flags);
}
static int futex_hash_get_slots(void)
to make the hash immutable once the upper limit has been reached. There will
be no more auto-resizing after that point. One could argue that if the user
has not touched it by then, he is probably not going to do it at all. This
would avoid the reference accounting.
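An application that does care about the hash size can still pick one itself
and pin it via prctl(), along the lines of what the schbench patch further
down does. A minimal userspace sketch, assuming the PR_FUTEX_HASH interface
from this series and 2048 buckets as an example size:

#include <stdio.h>
#include <sys/prctl.h>
#include <linux/prctl.h>

#ifndef PR_FUTEX_HASH
#define PR_FUTEX_HASH			78
# define PR_FUTEX_HASH_SET_SLOTS	1
# define PR_FUTEX_HASH_GET_SLOTS	2
# define FH_FLAG_IMMUTABLE		(1ULL << 0)
#endif

int main(void)
{
	/* Request 2048 private buckets and make them immutable so the
	 * kernel never resizes the hash and can skip the reference
	 * accounting on each futex operation.
	 */
	if (prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 2048,
		  FH_FLAG_IMMUTABLE) < 0)
		perror("PR_FUTEX_HASH_SET_SLOTS");

	printf("slots: %d\n", (int)prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS));
	return 0;
}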
Some testing:
256 cores, 2xNUMA:
| average rps: 1 701 947.02 Futex HBs: 0 immutable: 1
| average rps: 785 446.07 Futex HBs: 1024 immutable: 0
| average rps: 1 586 755.62 Futex HBs: 1024 immutable: 1
| average rps: 736 769.77 Futex HBs: 2048 immutable: 0
| average rps: 1 555 182.52 Futex HBs: 2048 immutable: 1
144 cores, 4xNUMA:
| average rps: 2 691 912.55 Futex HBs: 0 immutable: 1
| average rps: 1 306 443.68 Futex HBs: 1024 immutable: 0
| average rps: 2 471 382.28 Futex HBs: 1024 immutable: 1
| average rps: 1 269 503.90 Futex HBs: 2048 immutable: 0
| average rps: 2 656 228.67 Futex HBs: 2048 immutable: 1
Tested with this on top:
diff --git a/schbench.c b/schbench.c
index 1be3e280f5c38..40a5f0091111e 100644
--- a/schbench.c
+++ b/schbench.c
@@ -19,6 +19,8 @@
#include <unistd.h>
#include <errno.h>
#include <getopt.h>
+#include <linux/prctl.h>
+#include <sys/prctl.h>
#include <sys/time.h>
#include <time.h>
#include <string.h>
@@ -42,6 +44,9 @@
#define USEC_PER_SEC (1000000)
+static int futex_size = -1;
+static int futex_flags;
+
/* -m number of message threads */
static int message_threads = 1;
/* -t number of workers per message thread */
@@ -127,7 +132,7 @@ enum {
HELP_LONG_OPT = 1,
};
-char *option_string = "p:m:M:W:t:Cr:R:w:i:z:A:n:F:Lj:s:J:";
+char *option_string = "p:m:M:W:t:Cr:R:w:i:z:A:n:F:Lj:s:J:H:I";
static struct option long_options[] = {
{"pipe", required_argument, 0, 'p'},
{"message-threads", required_argument, 0, 'm'},
@@ -176,6 +181,29 @@ static void print_usage(void)
exit(1);
}
+#ifndef PR_FUTEX_HASH
+#define PR_FUTEX_HASH 78
+# define PR_FUTEX_HASH_SET_SLOTS 1
+# define FH_FLAG_IMMUTABLE (1ULL << 0)
+# define PR_FUTEX_HASH_GET_SLOTS 2
+# define PR_FUTEX_HASH_GET_IMMUTABLE 3
+#endif
+
+static int futex_hash_slots_set(unsigned int slots, int flags)
+{
+ return prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, slots, flags);
+}
+
+static int futex_hash_slots_get(void)
+{
+ return prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS);
+}
+
+static int futex_hash_immutable_get(void)
+{
+ return prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_IMMUTABLE);
+}
+
/*
* returns 0 if we fail to parse and 1 if we succeed
*/
@@ -347,6 +375,13 @@ static void parse_options(int ac, char **av)
exit(1);
}
break;
+ case 'H':
+ futex_size = atoi(optarg);
+ break;
+ case 'I':
+ futex_flags = FH_FLAG_IMMUTABLE;
+ break;
+
case '?':
case HELP_LONG_OPT:
print_usage();
@@ -1811,6 +1846,9 @@ int main(int ac, char **av)
}
}
+ if (futex_size >= 0)
+ futex_hash_slots_set(futex_size, futex_flags);
+
requests_per_sec /= message_threads;
loops_per_sec = 0;
stopping = 0;
@@ -1920,6 +1958,8 @@ int main(int ac, char **av)
}
free(message_threads_mem);
+ fprintf(stderr, "Futex HBs: %d immutable: %d\n", futex_hash_slots_get(),
+ futex_hash_immutable_get());
return 0;
}
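With that applied, the rows above can presumably be reproduced by adding the
new switches to the command line Chris quoted, e.g.:

./schbench -L -m 4 -M auto -t 256 -n 0 -r 60 -s 0 -H 2048	# 2048 private buckets
./schbench -L -m 4 -M auto -t 256 -n 0 -r 60 -s 0 -H 2048 -I	# same, immutable (no reference accounting)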
Comments?
> -chris
Sebastian