[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAGudoHFLqYbdqVdsGqWGiJNrZ+k8Ca2QUiB2oBfTmvbB4kLaAA@mail.gmail.com>
Date: Wed, 19 Nov 2025 19:49:39 +0100
From: Mateusz Guzik <mjguzik@...il.com>
To: brauner@...nel.org
Cc: viro@...iv.linux.org.uk, jack@...e.cz, linux-kernel@...r.kernel.org,
linux-fsdevel@...r.kernel.org
Subject: Re: [PATCH] fs: inline step_into() and walk_component()
Attached benchmark code, ploppable into will-it-scale.
On Wed, Nov 19, 2025 at 7:40 PM Mateusz Guzik <mjguzik@...il.com> wrote:
>
> The primary consumer is link_path_walk(), calling walk_component() every
> time which in turn calls step_into().
>
> Inlining these saves overhead of 2 function calls per path component,
> along with allowing the compiler to do a better job optimizing them in place.
>
> step_into() had absolutely atrocious assembly to facilitate the
> slowpath. In order to lessen the burden at the callsite all the hard
> work is moved into step_into_slowpath(). This also elides some of the
> branches as for example LOOKUP_RCU is only checked once.
>
> The inline-able step_into() variant deals with the common case of
> traversing a cached non-mounted-on directory while in RCU mode.
>
> Since symlink handling is already denoted as unlikely(), I took the
> opportunity to also shorten step_into_slowpath() by moving parts of it
> into pick_link() which further shortens assembly.
>
> Benchmarked as follows on Sapphire Rapids:
> 1. the "before" was a kernel with not-yet-merged optimizations (notably
> elision of calls to security_inode_permission() and marking ext4
> inodes as not having acls)
> 2. "after" is the same + this patch
> 3. benchmark consists of issuing 205 calls to access(2) in a loop with
> pathnames lifted out of gcc and the linker building real code, most
> of which have several path components and 118 of which fail with
> -ENOENT. Some of those do symlink traversal.
>
> In terms of ops/s:
> before: 21619
> after: 22536 (+4%)
>
> profile before:
> 20.25% [kernel] [k] __d_lookup_rcu
> 10.54% [kernel] [k] link_path_walk
> 10.22% [kernel] [k] entry_SYSCALL_64
> 6.50% libc.so.6 [.] __GI___access
> 6.35% [kernel] [k] strncpy_from_user
> 4.87% [kernel] [k] step_into
> 3.68% [kernel] [k] kmem_cache_alloc_noprof
> 2.88% [kernel] [k] walk_component
> 2.86% [kernel] [k] kmem_cache_free
> 2.14% [kernel] [k] set_root
> 2.08% [kernel] [k] lookup_fast
>
> after:
> 23.38% [kernel] [k] __d_lookup_rcu
> 11.27% [kernel] [k] entry_SYSCALL_64
> 10.89% [kernel] [k] link_path_walk
> 7.00% libc.so.6 [.] __GI___access
> 6.88% [kernel] [k] strncpy_from_user
> 3.50% [kernel] [k] kmem_cache_alloc_noprof
> 2.01% [kernel] [k] kmem_cache_free
> 2.00% [kernel] [k] set_root
> 1.99% [kernel] [k] lookup_fast
> 1.81% [kernel] [k] do_syscall_64
> 1.69% [kernel] [k] entry_SYSCALL_64_safe_stack
>
> While walk_component() and step_into() of course disappear from the
> profile, the link_path_walk() barely gets more overhead despite the
> inlining thanks to the fast path added and while completing more walks
> per second.
>
> I don't know why overhead grew a lot on __d_lookup_rcu().
>
> Signed-off-by: Mateusz Guzik <mjguzik@...il.com>
> ---
>
> perhaps this could be 2-3 patches instead to do things incrementally
> the other patches not listed in the commit message are:
> 1. mntput_not_expire slowpath (Christian took it, not present in fs-next yet)
> 2. nd->depth predicts.
> 3. lookup_slow noinline
>
> all of those sent
>
> you may notice step_into is quite high on the profile:
> 4.87% [kernel] [k] step_into
>
> here is the routine prior to the patch:
> call ffffffff81374630 <__fentry__>
> push %rbp
> mov %rsp,%rbp
> push %r15
> push %r14
> mov %esi,%r14d
> push %r13
> push %r12
> push %rbx
> mov %rdi,%rbx
> sub $0x28,%rsp
> mov (%rdi),%rax
> mov 0x38(%rdi),%r8d
> mov %gs:0x2e10acd(%rip),%r12 # ffffffff84551008 <__stack_chk_guard>
>
> mov %r12,0x20(%rsp)
> mov %rdx,%r12
> mov %rdx,0x18(%rsp)
> mov %rax,0x10(%rsp)
>
> This is setup before it even gets to do anything which of course has
> to be undone later. Also note the stackguard check. I'm not pasting
> everything; you can disasm it yourself to check the entire monstrosity.
>
> In contrast, this is the entirety of the step_into() fast path with the patch + noinline:
> call ffffffff81374630 <__fentry__>
> testb $0x1,0x39(%rdi)
> je ffffffff81740cbe <step_into+0x4e>
> mov (%rdx),%eax
> test $0x38000,%eax
> jne ffffffff81740cbe <step_into+0x4e>
> and $0x380000,%eax
> mov 0x30(%rdx),%rcx
> cmp $0x300000,%eax
> je ffffffff81740cbe <step_into+0x4e>
> mov (%rdi),%r8
> mov 0x44(%rdi),%esi
> mov 0x4(%rdx),%eax
> cmp %eax,%esi
> jne ffffffff81740cc3 <step_into+0x53>
> test %rcx,%rcx
> je ffffffff81740ccf <step_into+0x5f>
> mov 0x44(%rdi),%eax
> mov %r8,(%rdi)
> mov %rdx,0x8(%rdi)
> mov %eax,0x40(%rdi)
> xor %eax,%eax
> mov %rcx,0x30(%rdi)
> jmp ffffffff8230b1f0 <__pi___x86_return_thunk>
>
> This possibly can be shortened but I have not tried yet.
>
> fs/namei.c | 67 ++++++++++++++++++++++++++++++++++++++++--------------
> 1 file changed, 50 insertions(+), 17 deletions(-)
>
> diff --git a/fs/namei.c b/fs/namei.c
> index 1d1f864ad6ad..e00b9ce21536 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -1668,17 +1668,17 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
> bool jumped;
> int ret;
>
> - path->mnt = nd->path.mnt;
> - path->dentry = dentry;
> if (nd->flags & LOOKUP_RCU) {
> unsigned int seq = nd->next_seq;
> + if (likely(!(dentry->d_flags & DCACHE_MANAGED_DENTRY)))
> + return 0;
> if (likely(__follow_mount_rcu(nd, path)))
> return 0;
> // *path and nd->next_seq might've been clobbered
> path->mnt = nd->path.mnt;
> path->dentry = dentry;
> nd->next_seq = seq;
> - if (!try_to_unlazy_next(nd, dentry))
> + if (unlikely(!try_to_unlazy_next(nd, dentry)))
> return -ECHILD;
> }
> ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
> @@ -1941,12 +1941,23 @@ static int reserve_stack(struct nameidata *nd, struct path *link)
>
> enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
>
> -static const char *pick_link(struct nameidata *nd, struct path *link,
> +static noinline const char *pick_link(struct nameidata *nd, struct path *link,
> struct inode *inode, int flags)
> {
> struct saved *last;
> const char *res;
> - int error = reserve_stack(nd, link);
> + int error;
> +
> + if (nd->flags & LOOKUP_RCU) {
> + /* make sure that d_is_symlink above matches inode */
> + if (read_seqcount_retry(&link->dentry->d_seq, nd->next_seq))
> + return ERR_PTR(-ECHILD);
> + } else {
> + if (link->mnt == nd->path.mnt)
> + mntget(link->mnt);
> + }
> +
> + error = reserve_stack(nd, link);
>
> if (unlikely(error)) {
> if (!(nd->flags & LOOKUP_RCU))
> @@ -2021,14 +2032,17 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
> *
> * NOTE: dentry must be what nd->next_seq had been sampled from.
> */
> -static const char *step_into(struct nameidata *nd, int flags,
> +static noinline const char *step_into_slowpath(struct nameidata *nd, int flags,
> struct dentry *dentry)
> {
> struct path path;
> struct inode *inode;
> - int err = handle_mounts(nd, dentry, &path);
> + int err;
>
> - if (err < 0)
> + path.mnt = nd->path.mnt;
> + path.dentry = dentry;
> + err = handle_mounts(nd, dentry, &path);
> + if (unlikely(err < 0))
> return ERR_PTR(err);
> inode = path.dentry->d_inode;
> if (likely(!d_is_symlink(path.dentry)) ||
> @@ -2050,17 +2064,36 @@ static const char *step_into(struct nameidata *nd, int flags,
> nd->seq = nd->next_seq;
> return NULL;
> }
> - if (nd->flags & LOOKUP_RCU) {
> - /* make sure that d_is_symlink above matches inode */
> - if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
> - return ERR_PTR(-ECHILD);
> - } else {
> - if (path.mnt == nd->path.mnt)
> - mntget(path.mnt);
> - }
> return pick_link(nd, &path, inode, flags);
> }
>
> +static __always_inline const char *step_into(struct nameidata *nd, int flags,
> + struct dentry *dentry)
> +{
> + struct path path;
> + struct inode *inode;
> +
> + path.mnt = nd->path.mnt;
> + path.dentry = dentry;
> + if (!(nd->flags & LOOKUP_RCU))
> + goto slowpath;
> + if (unlikely(dentry->d_flags & DCACHE_MANAGED_DENTRY))
> + goto slowpath;
> + inode = path.dentry->d_inode;
> + if (unlikely(d_is_symlink(path.dentry)))
> + goto slowpath;
> + if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
> + return ERR_PTR(-ECHILD);
> + if (unlikely(!inode))
> + return ERR_PTR(-ENOENT);
> + nd->path = path;
> + nd->inode = inode;
> + nd->seq = nd->next_seq;
> + return NULL;
> +slowpath:
> + return step_into_slowpath(nd, flags, dentry);
> +}
> +
> static struct dentry *follow_dotdot_rcu(struct nameidata *nd)
> {
> struct dentry *parent, *old;
> @@ -2171,7 +2204,7 @@ static const char *handle_dots(struct nameidata *nd, int type)
> return NULL;
> }
>
> -static const char *walk_component(struct nameidata *nd, int flags)
> +static __always_inline const char *walk_component(struct nameidata *nd, int flags)
> {
> struct dentry *dentry;
> /*
> --
> 2.48.1
>
View attachment "access_compile.c" of type "text/x-csrc" (12408 bytes)
Powered by blists - more mailing lists