lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20260122170345.157803-1-realwujing@gmail.com>
Date: Thu, 22 Jan 2026 12:03:45 -0500
From: Qiliang Yuan <realwujing@...il.com>
To: viro@...iv.linux.org.uk,
	brauner@...nel.org
Cc: jack@...e.cz,
	linux-fsdevel@...r.kernel.org,
	linux-kernel@...r.kernel.org,
	yuanql9@...natelecom.cn,
	Qiliang Yuan <realwujing@...il.com>
Subject: [PATCH] fs/file: optimize FD allocation complexity with 3-level summary bitmap

Current FD allocation performs a two-level bitmap search (open_fds and
full_fds_bits). This results in O(N/64) complexity when searching for a
free FD, as the kernel needs to scan the first-level summary bitmap.

For processes with very large FD limits (e.g., millions of sockets),
scanning even the level 1 summary bitmap can become a bottleneck during
high-frequency allocation.

This patch introduces a third level of summary bitmap (full_fds_bits_l2),
where each bit represents whether a 64-word chunk (4096 FDs) in open_fds
is fully allocated. This reduces the search complexity to O(N/4096),
making FD allocation significantly more scalable for high-concurrency
workloads.

Signed-off-by: Qiliang Yuan <realwujing@...il.com>
Signed-off-by: Qiliang Yuan <yuanql9@...natelecom.cn>
---
 fs/file.c               | 45 +++++++++++++++++++++++++++++++++--------
 include/linux/fdtable.h |  2 ++
 2 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index 0a4f3bdb2dec..1163160e81af 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -114,6 +114,8 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
 
 #define BITBIT_NR(nr)	BITS_TO_LONGS(BITS_TO_LONGS(nr))
 #define BITBIT_SIZE(nr)	(BITBIT_NR(nr) * sizeof(long))
+#define BITBITBIT_NR(nr) BITS_TO_LONGS(BITBIT_NR(nr))
+#define BITBITBIT_SIZE(nr) (BITBITBIT_NR(nr) * sizeof(long))
 
 #define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds
 /*
@@ -132,6 +134,8 @@ static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
 			copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
 	bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits,
 			copy_words, nwords);
+	bitmap_copy_and_extend(nfdt->full_fds_bits_l2, ofdt->full_fds_bits_l2,
+			BITS_TO_LONGS(copy_words), BITS_TO_LONGS(nwords));
 }
 
 /*
@@ -222,7 +226,7 @@ static struct fdtable *alloc_fdtable(unsigned int slots_wanted)
 	fdt->fd = data;
 
 	data = kvmalloc(max_t(size_t,
-				 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
+				 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr) + BITBITBIT_SIZE(nr), L1_CACHE_BYTES),
 				 GFP_KERNEL_ACCOUNT);
 	if (!data)
 		goto out_arr;
@@ -231,6 +235,8 @@ static struct fdtable *alloc_fdtable(unsigned int slots_wanted)
 	fdt->close_on_exec = data;
 	data += nr / BITS_PER_BYTE;
 	fdt->full_fds_bits = data;
+	data += BITBIT_SIZE(nr);
+	fdt->full_fds_bits_l2 = data;
 
 	return fdt;
 
@@ -335,16 +341,24 @@ static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set)
 	__set_bit(fd, fdt->open_fds);
 	__set_close_on_exec(fd, fdt, set);
 	fd /= BITS_PER_LONG;
-	if (!~fdt->open_fds[fd])
+	if (!~fdt->open_fds[fd]) {
 		__set_bit(fd, fdt->full_fds_bits);
+		unsigned int idx = fd / BITS_PER_LONG;
+		if (!~fdt->full_fds_bits[idx])
+			__set_bit(idx, fdt->full_fds_bits_l2);
+	}
 }
 
 static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
 {
 	__clear_bit(fd, fdt->open_fds);
 	fd /= BITS_PER_LONG;
-	if (test_bit(fd, fdt->full_fds_bits))
+	if (test_bit(fd, fdt->full_fds_bits)) {
 		__clear_bit(fd, fdt->full_fds_bits);
+		unsigned int idx = fd / BITS_PER_LONG;
+		if (test_bit(idx, fdt->full_fds_bits_l2))
+			__clear_bit(idx, fdt->full_fds_bits_l2);
+	}
 }
 
 static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
@@ -402,6 +416,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho
 	new_fdt->close_on_exec = newf->close_on_exec_init;
 	new_fdt->open_fds = newf->open_fds_init;
 	new_fdt->full_fds_bits = newf->full_fds_bits_init;
+	new_fdt->full_fds_bits_l2 = newf->full_fds_bits_init_l2;
 	new_fdt->fd = &newf->fd_array[0];
 
 	spin_lock(&oldf->file_lock);
@@ -536,6 +551,7 @@ struct files_struct init_files = {
 		.close_on_exec	= init_files.close_on_exec_init,
 		.open_fds	= init_files.open_fds_init,
 		.full_fds_bits	= init_files.full_fds_bits_init,
+		.full_fds_bits_l2 = init_files.full_fds_bits_init_l2,
 	},
 	.file_lock	= __SPIN_LOCK_UNLOCKED(init_files.file_lock),
 	.resize_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
@@ -545,22 +561,35 @@ static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
 {
 	unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
 	unsigned int maxbit = maxfd / BITS_PER_LONG;
+	unsigned int maxbit_l2 = BITBIT_NR(maxfd);
 	unsigned int bitbit = start / BITS_PER_LONG;
+	unsigned int bitbit_l2 = bitbit / BITS_PER_LONG;
 	unsigned int bit;
 
 	/*
-	 * Try to avoid looking at the second level bitmap
+	 * Try to avoid looking at the upper level bitmaps
 	 */
 	bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG,
 				 start & (BITS_PER_LONG - 1));
 	if (bit < BITS_PER_LONG)
 		return bit + bitbit * BITS_PER_LONG;
 
-	bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
-	if (bitbit >= maxfd)
+	/* Algorithmic Optimization: O(N) -> O(1) via 3rd-level summary bitmap */
+	bitbit_l2 = find_next_zero_bit(fdt->full_fds_bits_l2, maxbit_l2, bitbit_l2);
+	if (bitbit_l2 >= maxbit_l2)
 		return maxfd;
-	if (bitbit > start)
-		start = bitbit;
+
+	if (bitbit_l2 * BITS_PER_LONG > bitbit)
+		bitbit = bitbit_l2 * BITS_PER_LONG;
+
+	bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit);
+	if (bitbit >= maxbit)
+		return maxfd;
+
+	bit = bitbit * BITS_PER_LONG;
+	if (bit > start)
+		start = bit;
+
 	return find_next_zero_bit(fdt->open_fds, maxfd, start);
 }
 
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index c45306a9f007..992b4ed9c1e0 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -29,6 +29,7 @@ struct fdtable {
 	unsigned long *close_on_exec;
 	unsigned long *open_fds;
 	unsigned long *full_fds_bits;
+	unsigned long *full_fds_bits_l2;
 	struct rcu_head rcu;
 };
 
@@ -53,6 +54,7 @@ struct files_struct {
 	unsigned long close_on_exec_init[1];
 	unsigned long open_fds_init[1];
 	unsigned long full_fds_bits_init[1];
+	unsigned long full_fds_bits_init_l2[1];
 	struct file __rcu * fd_array[NR_OPEN_DEFAULT];
 };
 
-- 
2.51.0


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ