Message-ID: <4qzknwustcw7vkpky3z5kvkmjp3burhwipivfeqr4cine3i4x5@772ocqtmmcoj>
Date: Thu, 22 Jan 2026 21:44:50 +0100
From: Mateusz Guzik <mjguzik@...il.com>
To: Qiliang Yuan <realwujing@...il.com>
Cc: viro@...iv.linux.org.uk, brauner@...nel.org, jack@...e.cz,
linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org, yuanql9@...natelecom.cn
Subject: Re: [PATCH] fs/file: optimize FD allocation complexity with 3-level
summary bitmap
On Thu, Jan 22, 2026 at 12:03:45PM -0500, Qiliang Yuan wrote:
> Current FD allocation performs a two-level bitmap search (open_fds and
> full_fds_bits). This results in O(N/64) complexity when searching for a
> free FD, as the kernel needs to scan the first-level summary bitmap.
>
> For processes with very large FD limits (e.g., millions of sockets),
> scanning even the level 1 summary bitmap can become a bottleneck during
> high-frequency allocation.
>
> This patch introduces a third level of summary bitmap (full_fds_bits_l2),
> where each bit represents whether a 64-word chunk (4096 FDs) in open_fds
> is fully allocated. This reduces the search complexity to O(N/4096),
> making FD allocation significantly more scalable for high-concurrency
> workloads.
>
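To illustrate what the extra level is supposed to buy, here is a toy
userspace model of the proposed 3-level lookup. Names mirror the patch,
but this is emphatically not the kernel code -- no locking, no resize,
no find_next_zero_bit():

        #include <stdio.h>
        #include <stdint.h>

        #define MAXFD    (1u << 20)         /* pretend limit: ~1M fds    */
        #define WORDS    (MAXFD / 64)       /* level 0: open_fds words   */
        #define L1_WORDS (WORDS / 64)       /* level 1: full_fds_bits    */
        #define L2_WORDS (L1_WORDS / 64)    /* level 2: full_fds_bits_l2 */

        static uint64_t open_fds[WORDS];
        static uint64_t full_l1[L1_WORDS];  /* bit set: 64 fds all taken   */
        static uint64_t full_l2[L2_WORDS];  /* bit set: 4096 fds all taken */

        static unsigned find_free_fd(void)
        {
                for (unsigned i = 0; i < L2_WORDS; i++) {
                        if (!~full_l2[i])   /* 64 * 4096 fds full, skip */
                                continue;
                        /* lowest zero bit at each level narrows the search */
                        unsigned l1 = i * 64 + __builtin_ctzll(~full_l2[i]);
                        unsigned w  = l1 * 64 + __builtin_ctzll(~full_l1[l1]);
                        return w * 64 + __builtin_ctzll(~open_fds[w]);
                }
                return MAXFD;               /* table full */
        }

        static void take_fd(unsigned fd)
        {
                unsigned w = fd / 64;
                open_fds[w] |= 1ull << (fd % 64);
                if (~open_fds[w])
                        return;
                full_l1[w / 64] |= 1ull << (w % 64);   /* word went full */
                if (!~full_l1[w / 64])                 /* 64-word chunk full */
                        full_l2[w / 4096] |= 1ull << (w / 64 % 64);
        }

        int main(void)
        {
                for (unsigned i = 0; i < 100000; i++)
                        take_fd(find_free_fd());
                printf("next free fd: %u\n", find_free_fd()); /* 100000 */
                return 0;
        }

The top-level scan touches MAXFD/4096 bits instead of MAXFD/64, which
is the entirety of the claimed win.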
Do you have results to justify this change?

Note this adds more bouncing memory accesses to the lock-protected area.

The centralized locking is a long-standing problem and I'm not
convinced there is more room for lipstick on that pig.

The real solution(tm) would instead make it feasible to not need said
locking.

As is, all programs are shafted because any fd-allocating syscall is
required to return the lowest fd available.

In another life I suggested introducing a new flag for accept, open et
al. which would waive that requirement for that specific invocation,
in turn allowing scalable allocation instead.

Say it would be called O_ANYFD/SOCK_ANYFD.
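From userspace the opt-in could look something like this -- the flag
and its value are hypothetical, nothing implements it:

        #include <stdio.h>
        #include <sys/socket.h>

        /*
         * Hypothetical opt-in flag: the value is made up and no kernel
         * implements it. Current kernels reject unknown type flags with
         * EINVAL, which doubles as the fallback path for old kernels.
         */
        #define SOCK_ANYFD 0x20000000

        int main(void)
        {
                int s = socket(AF_INET, SOCK_STREAM | SOCK_ANYFD, 0);
                if (s < 0)
                        s = socket(AF_INET, SOCK_STREAM, 0); /* legacy path */
                printf("fd %d (no lowest-fd guarantee with ANYFD)\n", s);
                return 0;
        }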
Then the fd space can be partitioned in some capacity -- maybe per
thread, maybe per cpu, maybe some other granularity, but it would no
longer be centralized.

Code which knows about the feature and is fine with it can pass the
flag and in turn allocate from one of the extra tables by default.

Code oblivious to the feature would not see a change in behavior, as
absent the flag the kernel would still find the lowest fd.
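A grossly simplified, single-threaded toy of such a partitioning, with
granularity and exhaustion policy deliberately handwaved:

        #include <stdbool.h>
        #include <stdio.h>

        #define NCPU     4
        #define RANGE_SZ 256
        #define TABLE_SZ (NCPU * RANGE_SZ)

        static bool used[TABLE_SZ];

        /* opt-in path: only touches this cpu's slice of the table */
        static int alloc_anyfd(int cpu)
        {
                for (int i = cpu * RANGE_SZ; i < (cpu + 1) * RANGE_SZ; i++)
                        if (!used[i]) { used[i] = true; return i; }
                return -1;      /* slice full; real code would fall back */
        }

        /* legacy path: POSIX lowest-available semantics preserved */
        static int alloc_lowest(void)
        {
                for (int i = 0; i < TABLE_SZ; i++)
                        if (!used[i]) { used[i] = true; return i; }
                return -1;
        }

        int main(void)
        {
                printf("anyfd on cpu 2 -> %d\n", alloc_anyfd(2)); /* 512 */
                printf("lowest         -> %d\n", alloc_lowest()); /* 0   */
                return 0;
        }

The point being the ANYFD path never competes for the shared low range.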
Ultimately any program written with performance in mind can then be
trivially tweaked to use it and get the benefit. The hard part is
making it sensibly work.
> Signed-off-by: Qiliang Yuan <realwujing@...il.com>
> Signed-off-by: Qiliang Yuan <yuanql9@...natelecom.cn>
> ---
> fs/file.c | 45 +++++++++++++++++++++++++++++++++--------
> include/linux/fdtable.h | 2 ++
> 2 files changed, 39 insertions(+), 8 deletions(-)
>
> diff --git a/fs/file.c b/fs/file.c
> index 0a4f3bdb2dec..1163160e81af 100644
> --- a/fs/file.c
> +++ b/fs/file.c
> @@ -114,6 +114,8 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
>
> #define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr))
> #define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long))
> +#define BITBITBIT_NR(nr) BITS_TO_LONGS(BITBIT_NR(nr))
> +#define BITBITBIT_SIZE(nr) (BITBITBIT_NR(nr) * sizeof(long))
>
> #define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds
> /*
> @@ -132,6 +134,8 @@ static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
> copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
> bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits,
> copy_words, nwords);
> + bitmap_copy_and_extend(nfdt->full_fds_bits_l2, ofdt->full_fds_bits_l2,
> + BITS_TO_LONGS(copy_words), BITS_TO_LONGS(nwords));
> }
>
> /*
> @@ -222,7 +226,7 @@ static struct fdtable *alloc_fdtable(unsigned int slots_wanted)
> fdt->fd = data;
>
> data = kvmalloc(max_t(size_t,
> - 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
> + 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr) + BITBITBIT_SIZE(nr), L1_CACHE_BYTES),
> GFP_KERNEL_ACCOUNT);
> if (!data)
> goto out_arr;
> @@ -231,6 +235,8 @@ static struct fdtable *alloc_fdtable(unsigned int slots_wanted)
> fdt->close_on_exec = data;
> data += nr / BITS_PER_BYTE;
> fdt->full_fds_bits = data;
> + data += BITBIT_SIZE(nr);
> + fdt->full_fds_bits_l2 = data;
>
> return fdt;
>
> @@ -335,16 +341,24 @@ static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set)
> __set_bit(fd, fdt->open_fds);
> __set_close_on_exec(fd, fdt, set);
> fd /= BITS_PER_LONG;
> - if (!~fdt->open_fds[fd])
> + if (!~fdt->open_fds[fd]) {
> __set_bit(fd, fdt->full_fds_bits);
> + unsigned int idx = fd / BITS_PER_LONG;
> + if (!~fdt->full_fds_bits[idx])
> + __set_bit(idx, fdt->full_fds_bits_l2);
> + }
> }
>
> static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
> {
> __clear_bit(fd, fdt->open_fds);
> fd /= BITS_PER_LONG;
> - if (test_bit(fd, fdt->full_fds_bits))
> + if (test_bit(fd, fdt->full_fds_bits)) {
> __clear_bit(fd, fdt->full_fds_bits);
> + unsigned int idx = fd / BITS_PER_LONG;
> + if (test_bit(idx, fdt->full_fds_bits_l2))
> + __clear_bit(idx, fdt->full_fds_bits_l2);
> + }
> }
>
> static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
> @@ -402,6 +416,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho
> new_fdt->close_on_exec = newf->close_on_exec_init;
> new_fdt->open_fds = newf->open_fds_init;
> new_fdt->full_fds_bits = newf->full_fds_bits_init;
> + new_fdt->full_fds_bits_l2 = newf->full_fds_bits_init_l2;
> new_fdt->fd = &newf->fd_array[0];
>
> spin_lock(&oldf->file_lock);
> @@ -536,6 +551,7 @@ struct files_struct init_files = {
> .close_on_exec = init_files.close_on_exec_init,
> .open_fds = init_files.open_fds_init,
> .full_fds_bits = init_files.full_fds_bits_init,
> + .full_fds_bits_l2 = init_files.full_fds_bits_init_l2,
> },
> .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
> .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
> @@ -545,22 +561,35 @@ static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
> {
> unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
> unsigned int maxbit = maxfd / BITS_PER_LONG;
> + unsigned int maxbit_l2 = BITBIT_NR(maxfd);
> unsigned int bitbit = start / BITS_PER_LONG;
> + unsigned int bitbit_l2 = bitbit / BITS_PER_LONG;
> unsigned int bit;
>
> /*
> - * Try to avoid looking at the second level bitmap
> + * Try to avoid looking at the upper level bitmaps
> */
> bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG,
> start & (BITS_PER_LONG - 1));
> if (bit < BITS_PER_LONG)
> return bit + bitbit * BITS_PER_LONG;
>
> - bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
> - if (bitbit >= maxfd)
> + /* Algorithmic Optimization: O(N) -> O(1) via 3rd-level summary bitmap */
> + bitbit_l2 = find_next_zero_bit(fdt->full_fds_bits_l2, maxbit_l2, bitbit_l2);
> + if (bitbit_l2 >= maxbit_l2)
> return maxfd;
> - if (bitbit > start)
> - start = bitbit;
> +
> + if (bitbit_l2 * BITS_PER_LONG > bitbit)
> + bitbit = bitbit_l2 * BITS_PER_LONG;
> +
> + bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit);
> + if (bitbit >= maxbit)
> + return maxfd;
> +
> + bit = bitbit * BITS_PER_LONG;
> + if (bit > start)
> + start = bit;
> +
> return find_next_zero_bit(fdt->open_fds, maxfd, start);
> }
>
> diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
> index c45306a9f007..992b4ed9c1e0 100644
> --- a/include/linux/fdtable.h
> +++ b/include/linux/fdtable.h
> @@ -29,6 +29,7 @@ struct fdtable {
> unsigned long *close_on_exec;
> unsigned long *open_fds;
> unsigned long *full_fds_bits;
> + unsigned long *full_fds_bits_l2;
> struct rcu_head rcu;
> };
>
> @@ -53,6 +54,7 @@ struct files_struct {
> unsigned long close_on_exec_init[1];
> unsigned long open_fds_init[1];
> unsigned long full_fds_bits_init[1];
> + unsigned long full_fds_bits_init_l2[1];
> struct file __rcu * fd_array[NR_OPEN_DEFAULT];
> };
>
> --
> 2.51.0
>