lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <46d5c480-87d0-4f6a-bcc2-6c936c87e216@redhat.com>
Date: Wed, 4 Feb 2026 13:16:15 -0500
From: Waiman Long <llong@...hat.com>
To: Al Viro <viro@...iv.linux.org.uk>, Waiman Long <llong@...hat.com>
Cc: Paul Moore <paul@...l-moore.com>, Eric Paris <eparis@...hat.com>,
 Christian Brauner <brauner@...nel.org>, linux-kernel@...r.kernel.org,
 audit@...r.kernel.org, Richard Guy Briggs <rgb@...hat.com>,
 Ricardo Robaina <rrobaina@...hat.com>
Subject: Re: [PATCH v2] audit: Avoid excessive dput/dget in audit_context
 setup and reset paths


On 2/4/26 1:26 AM, Al Viro wrote:
> On Tue, Feb 03, 2026 at 11:21:23PM -0500, Waiman Long wrote:
>
>> Interesting. So are you thinking about a reference on the pwd inside the
>> fs_struct?
> No.  fs_struct has pwd pinned - both dentry and mount.
>
> Each thread has a reference to fs_struct instance in its task_struct (task->fs).
>
> It is possible for several threads to have their task->fs pointing to the
> same instance.
>
> A thread may modify its fs_struct pointer, making it point to a different
> fs_struct instance; that happens on unshare(2).  In that case new instance
> is an identical copy of the original one; reference counts of mounts and dentries
> (both for pwd and root) are incremented to account for additional references
> in the copy.  That assignment to current->fs happens under task_lock(current).
>
> A child gets either a reference to identical copy of parent's fs_struct or
> an extra reference to parent's fs_struct.  In either case child->fs is
> set before the child gets to run; as the matter of fact, that happens before
> anyone besides the parent might observe the task_struct of child.
>
> That's it - no other stores to task_struct.fs are ever allowed; no other
> thread can change your reference under you.
>
>
> Other than initializing a new copy, *all* stores to fs_struct.pwd are done
> to current->fs.pwd; access to any other fs_struct instances is read-only.
> The only way another thread might change your pwd is if that thread shares
> fs_struct with you.  They can observe its contents, as long as they take
> care to hold task_lock(your_thread), but no more than that.
>
> What's more, if current->fs is the sole reference to fs_struct instance,
> it will remain such until you spawn a child and set it to share your
> instance (CLONE_FS).  No other thread can gain extra references to it,
> etc.
>
>
> What it means is that if you are holding the sole reference to current->fs,
> current->fs->pwd contents will remain unchanged (and pinning the same mount
> and dentry) through the entire syscall with very few exceptions:
> 	1) you spawn a child with CLONE_FS - that has to be either
> clone(2) or io_uring_setup(2) (the latter - via worker threads).
> 	2) you explicitly modify your pwd - in chdir(2), fchdir(2),
> pivot_root(2), setns(2) (with CLONE_NEWNS; it switches you to the root of
> namespace you've joined)
>
> As long as these exceptions are accounted for, audit could check
> current->fs->users *and* skip incrementing refcounts if it's equal to 1.
> It would need to remember whether these reference had been taken -
> rechecking condition won't work, since other threads might by gone
> by the time we leave the syscall turning "shared" to "not shared".
>
> That way we can avoid grabbing any references in a fairly common
> case.
Thanks for the detailed explanation. I am thinking about something like
the code diff below. Of course, there are other corner cases like unshare(2)
that still needs to be handled. Do you think something like this is viable?

Thanks,
Longman

=========================[ Cut here ]===================================

diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index b8c46c5a38a0..09c97059776d 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -25,6 +25,23 @@ void set_fs_root(struct fs_struct *fs, const struct 
path *pa>
                 path_put(&old_root);
  }

+static void unshare_fs_pwd_locked(struct fs_struct *fs)
+{
+       get_task_struct(current);
+       fs->pwd_waiter = current;
+retry:
+       spin_unlock(&fs->seq.lock);
+       /* Sleep until pwd_refs reaches 0 */
+       set_current_state(TASK_UNINTERRUPTIBLE);
+       schedule();
+       spin_lock(&fs->seq.lock);
+       if (fs->pwd_refs)
+               goto retry;
+       __set_current_state(TASK_RUNNING);
+       fs->pwd_waiter = NULL;
+       put_task_struct(current);
+}
+
  /*
   * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
   * It can block.
@@ -34,7 +51,15 @@ void set_fs_pwd(struct fs_struct *fs, const struct 
path *pat>
         struct path old_pwd;

         path_get(path);
-       write_seqlock(&fs->seq);
+       /*
+        * As the pwd may be shared with other tasks, we need to break down
+        * the write_seqlock() call to its component spin_lock() and
+        * do_write_seqcount_begin() calls.
+        */
+       spin_lock(&fs->seq.lock);
+       if (unlikely(fs->pwd_refs))
+               unshare_fs_pwd_locked(fs);
+  do_write_seqcount_begin(&fs->seq.seqcount.seqcount);
         old_pwd = fs->pwd;
         fs->pwd = *path;
         write_sequnlock(&fs->seq);
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 0070764b790a..3848189893c7 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -13,6 +13,8 @@ struct fs_struct {
         int umask;
         int in_exec;
         struct path root, pwd;
+       int pwd_refs;
+       struct task_struct *pwd_waiter; /* set_fs_pwd() waiter */
  } __randomize_layout;

  extern struct kmem_cache *fs_cachep;
@@ -40,6 +42,43 @@ static inline void get_fs_pwd(struct fs_struct *fs, 
struct p>
         read_sequnlock_excl(&fs->seq);
  }

+/*
+ * The get_fs_pwd_share/put_fs_pwd_share APIs should only be used by 
callers
+ * that need to keep the pwd references for a short time to avoid blocking
+ * set_fs_pwd() caller for long time.
+ */
+static inline bool get_fs_pwd_share(struct fs_struct *fs, struct path *pwd)
+{
+       bool share = true;
+
+       read_seqlock_excl(&fs->seq);
+       *pwd = fs->pwd;
+       share = !fs->pwd_waiter;
+       if (likely(share))
+               fs->pwd_refs++;
+       else
+               path_get(pwd);
+       read_sequnlock_excl(&fs->seq);
+       return share;
+}
+
+static inline void put_fs_pwd_share(struct fs_struct *fs, struct path 
*pwd, bo>
+{
+       struct task_struct *wakeup_task = NULL;
+
+       if (!share) {
+               path_put(pwd);
+               return;
+       }
+       read_seqlock_excl(&fs->seq);
+       fs->pwd_refs--;
+       if (fs->pwd_waiter && !fs->pwd_refs)
+               wakeup_task = fs->pwd_waiter;
+       read_sequnlock_excl(&fs->seq);
+       if (wakeup_task)
+               wake_up_process(wakeup_task);
+}
+
  extern bool current_chrooted(void);

  static inline int current_umask(void)


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ