[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <CAGudoHHqzayp2yY4dtqVSQj30iONngSCpy34_f-4ZkMyj8mh5Q@mail.gmail.com>
Date: Tue, 21 Oct 2025 14:54:16 +0200
From: Mateusz Guzik <mjguzik@...il.com>
To: Christian Brauner <brauner@...nel.org>
Cc: viro@...iv.linux.org.uk, jack@...e.cz, linux-kernel@...r.kernel.org,
linux-fsdevel@...r.kernel.org
Subject: Re: [PATCH] fs: rework I_NEW handling to operate without fences
On Tue, Oct 21, 2025 at 2:48 PM Christian Brauner <brauner@...nel.org> wrote:
>
> On Wed, Oct 15, 2025 at 01:50:25PM +0200, Mateusz Guzik wrote:
> > can i get some flames on this?
>
> Ok, that looks fine to me. I don't particularly enjoy that boolean but I
> think it simplifies d_instantiate_new() enough to make up for it.
>
Yes, going to sleep (or not) is lock-protected, obsoleting the need
for custom fences.
This is an incremental cleanup. I have more in the pipeline which
should dedup most of the current handling, but there is quite a bit to
clean up first and I have not decided which way I'll try to go
first.
> >
> > On Sat, Oct 11, 2025 at 12:17 AM Mateusz Guzik <mjguzik@...il.com> wrote:
> > >
> > > In the inode hash code grab the state while ->i_lock is held. If found
> > > to be set, synchronize the sleep once more with the lock held.
> > >
> > > In the real world the flag is not set most of the time.
> > >
> > > Apart from being simpler to reason about, it comes with a minor speed up
> > > as now clearing the flag does not require the smp_mb() fence.
> > >
> > > While here rename wait_on_inode() to wait_on_new_inode() to line it up
> > > with __wait_on_freeing_inode().
> > >
> > > Signed-off-by: Mateusz Guzik <mjguzik@...il.com>
> > > ---
> > >
> > > This temporarily duplicated sleep code from inode_wait_for_lru_isolating().
> > > This is going to get dedupped later.
> > >
> > > There is high repetition of:
> > > if (unlikely(isnew)) {
> > > wait_on_new_inode(old);
> > > if (unlikely(inode_unhashed(old))) {
> > > iput(old);
> > > goto again;
> > > }
> > >
> > > I expect this is going to go away after I post a patch to sanitize the
> > > current APIs for the hash.
> > >
> > >
> > > fs/afs/dir.c | 4 +-
> > > fs/dcache.c | 10 ----
> > > fs/gfs2/glock.c | 2 +-
> > > fs/inode.c | 146 +++++++++++++++++++++++++++------------------
> > > include/linux/fs.h | 12 +---
> > > 5 files changed, 93 insertions(+), 81 deletions(-)
> > >
> > > diff --git a/fs/afs/dir.c b/fs/afs/dir.c
> > > index 89d36e3e5c79..f4e9e12373ac 100644
> > > --- a/fs/afs/dir.c
> > > +++ b/fs/afs/dir.c
> > > @@ -779,7 +779,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry)
> > > struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode;
> > > struct inode *inode = NULL, *ti;
> > > afs_dataversion_t data_version = READ_ONCE(dvnode->status.data_version);
> > > - bool supports_ibulk;
> > > + bool supports_ibulk, isnew;
> > > long ret;
> > > int i;
> > >
> > > @@ -850,7 +850,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry)
> > > * callback counters.
> > > */
> > > ti = ilookup5_nowait(dir->i_sb, vp->fid.vnode,
> > > - afs_ilookup5_test_by_fid, &vp->fid);
> > > + afs_ilookup5_test_by_fid, &vp->fid, &isnew);
> > > if (!IS_ERR_OR_NULL(ti)) {
> > > vnode = AFS_FS_I(ti);
> > > vp->dv_before = vnode->status.data_version;
> > > diff --git a/fs/dcache.c b/fs/dcache.c
> > > index 78ffa7b7e824..25131f105a60 100644
> > > --- a/fs/dcache.c
> > > +++ b/fs/dcache.c
> > > @@ -1981,17 +1981,7 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode)
> > > spin_lock(&inode->i_lock);
> > > __d_instantiate(entry, inode);
> > > WARN_ON(!(inode_state_read(inode) & I_NEW));
> > > - /*
> > > - * Pairs with smp_rmb in wait_on_inode().
> > > - */
> > > - smp_wmb();
> > > inode_state_clear(inode, I_NEW | I_CREATING);
> > > - /*
> > > - * Pairs with the barrier in prepare_to_wait_event() to make sure
> > > - * ___wait_var_event() either sees the bit cleared or
> > > - * waitqueue_active() check in wake_up_var() sees the waiter.
> > > - */
> > > - smp_mb();
> > > inode_wake_up_bit(inode, __I_NEW);
> > > spin_unlock(&inode->i_lock);
> > > }
> > > diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
> > > index b677c0e6b9ab..c9712235e7a0 100644
> > > --- a/fs/gfs2/glock.c
> > > +++ b/fs/gfs2/glock.c
> > > @@ -957,7 +957,7 @@ static struct gfs2_inode *gfs2_grab_existing_inode(struct gfs2_glock *gl)
> > > ip = NULL;
> > > spin_unlock(&gl->gl_lockref.lock);
> > > if (ip) {
> > > - wait_on_inode(&ip->i_inode);
> > > + wait_on_new_inode(&ip->i_inode);
> > > if (is_bad_inode(&ip->i_inode)) {
> > > iput(&ip->i_inode);
> > > ip = NULL;
> > > diff --git a/fs/inode.c b/fs/inode.c
> > > index 3153d725859c..1396f79b2551 100644
> > > --- a/fs/inode.c
> > > +++ b/fs/inode.c
> > > @@ -558,6 +558,32 @@ struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
> > > }
> > > EXPORT_SYMBOL(inode_bit_waitqueue);
> > >
> > > +void wait_on_new_inode(struct inode *inode)
> > > +{
> > > + struct wait_bit_queue_entry wqe;
> > > + struct wait_queue_head *wq_head;
> > > +
> > > + spin_lock(&inode->i_lock);
> > > + if (!(inode_state_read(inode) & I_NEW)) {
> > > + spin_unlock(&inode->i_lock);
> > > + return;
> > > + }
> > > +
> > > + wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW);
> > > + for (;;) {
> > > + prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
> > > + if (!(inode_state_read(inode) & I_NEW))
> > > + break;
> > > + spin_unlock(&inode->i_lock);
> > > + schedule();
> > > + spin_lock(&inode->i_lock);
> > > + }
> > > + finish_wait(wq_head, &wqe.wq_entry);
> > > + WARN_ON(inode_state_read(inode) & I_NEW);
> > > + spin_unlock(&inode->i_lock);
> > > +}
> > > +EXPORT_SYMBOL(wait_on_new_inode);
> > > +
> > > /*
> > > * Add inode to LRU if needed (inode is unused and clean).
> > > *
> > > @@ -1008,7 +1034,8 @@ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_lock
> > > static struct inode *find_inode(struct super_block *sb,
> > > struct hlist_head *head,
> > > int (*test)(struct inode *, void *),
> > > - void *data, bool is_inode_hash_locked)
> > > + void *data, bool is_inode_hash_locked,
> > > + bool *isnew)
> > > {
> > > struct inode *inode = NULL;
> > >
> > > @@ -1035,6 +1062,7 @@ static struct inode *find_inode(struct super_block *sb,
> > > return ERR_PTR(-ESTALE);
> > > }
> > > __iget(inode);
> > > + *isnew = !!(inode_state_read(inode) & I_NEW);
> > > spin_unlock(&inode->i_lock);
> > > rcu_read_unlock();
> > > return inode;
> > > @@ -1049,7 +1077,7 @@ static struct inode *find_inode(struct super_block *sb,
> > > */
> > > static struct inode *find_inode_fast(struct super_block *sb,
> > > struct hlist_head *head, unsigned long ino,
> > > - bool is_inode_hash_locked)
> > > + bool is_inode_hash_locked, bool *isnew)
> > > {
> > > struct inode *inode = NULL;
> > >
> > > @@ -1076,6 +1104,7 @@ static struct inode *find_inode_fast(struct super_block *sb,
> > > return ERR_PTR(-ESTALE);
> > > }
> > > __iget(inode);
> > > + *isnew = !!(inode_state_read(inode) & I_NEW);
> > > spin_unlock(&inode->i_lock);
> > > rcu_read_unlock();
> > > return inode;
> > > @@ -1181,17 +1210,7 @@ void unlock_new_inode(struct inode *inode)
> > > lockdep_annotate_inode_mutex_key(inode);
> > > spin_lock(&inode->i_lock);
> > > WARN_ON(!(inode_state_read(inode) & I_NEW));
> > > - /*
> > > - * Pairs with smp_rmb in wait_on_inode().
> > > - */
> > > - smp_wmb();
> > > inode_state_clear(inode, I_NEW | I_CREATING);
> > > - /*
> > > - * Pairs with the barrier in prepare_to_wait_event() to make sure
> > > - * ___wait_var_event() either sees the bit cleared or
> > > - * waitqueue_active() check in wake_up_var() sees the waiter.
> > > - */
> > > - smp_mb();
>
> You're getting rid of smp_mb() because you're rechecking the flag under
> i_lock after you called prepare_to_wait_event() in wait_on_new_inode()?
>
> > > inode_wake_up_bit(inode, __I_NEW);
> > > spin_unlock(&inode->i_lock);
> > > }
> > > @@ -1202,17 +1221,7 @@ void discard_new_inode(struct inode *inode)
> > > lockdep_annotate_inode_mutex_key(inode);
> > > spin_lock(&inode->i_lock);
> > > WARN_ON(!(inode_state_read(inode) & I_NEW));
> > > - /*
> > > - * Pairs with smp_rmb in wait_on_inode().
> > > - */
> > > - smp_wmb();
> > > inode_state_clear(inode, I_NEW);
> > > - /*
> > > - * Pairs with the barrier in prepare_to_wait_event() to make sure
> > > - * ___wait_var_event() either sees the bit cleared or
> > > - * waitqueue_active() check in wake_up_var() sees the waiter.
> > > - */
> > > - smp_mb();
> > > inode_wake_up_bit(inode, __I_NEW);
> > > spin_unlock(&inode->i_lock);
> > > iput(inode);
> > > @@ -1286,12 +1295,13 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
> > > {
> > > struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
> > > struct inode *old;
> > > + bool isnew;
> > >
> > > might_sleep();
> > >
> > > again:
> > > spin_lock(&inode_hash_lock);
> > > - old = find_inode(inode->i_sb, head, test, data, true);
> > > + old = find_inode(inode->i_sb, head, test, data, true, &isnew);
> > > if (unlikely(old)) {
> > > /*
> > > * Uhhuh, somebody else created the same inode under us.
> > > @@ -1300,10 +1310,12 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
> > > spin_unlock(&inode_hash_lock);
> > > if (IS_ERR(old))
> > > return NULL;
> > > - wait_on_inode(old);
> > > - if (unlikely(inode_unhashed(old))) {
> > > - iput(old);
> > > - goto again;
> > > + if (unlikely(isnew)) {
> > > + wait_on_new_inode(old);
> > > + if (unlikely(inode_unhashed(old))) {
> > > + iput(old);
> > > + goto again;
> > > + }
> > > }
> > > return old;
> > > }
> > > @@ -1391,18 +1403,21 @@ struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval,
> > > {
> > > struct hlist_head *head = inode_hashtable + hash(sb, hashval);
> > > struct inode *inode, *new;
> > > + bool isnew;
> > >
> > > might_sleep();
> > >
> > > again:
> > > - inode = find_inode(sb, head, test, data, false);
> > > + inode = find_inode(sb, head, test, data, false, &isnew);
> > > if (inode) {
> > > if (IS_ERR(inode))
> > > return NULL;
> > > - wait_on_inode(inode);
> > > - if (unlikely(inode_unhashed(inode))) {
> > > - iput(inode);
> > > - goto again;
> > > + if (unlikely(isnew)) {
> > > + wait_on_new_inode(inode);
> > > + if (unlikely(inode_unhashed(inode))) {
> > > + iput(inode);
> > > + goto again;
> > > + }
> > > }
> > > return inode;
> > > }
> > > @@ -1434,18 +1449,21 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
> > > {
> > > struct hlist_head *head = inode_hashtable + hash(sb, ino);
> > > struct inode *inode;
> > > + bool isnew;
> > >
> > > might_sleep();
> > >
> > > again:
> > > - inode = find_inode_fast(sb, head, ino, false);
> > > + inode = find_inode_fast(sb, head, ino, false, &isnew);
> > > if (inode) {
> > > if (IS_ERR(inode))
> > > return NULL;
> > > - wait_on_inode(inode);
> > > - if (unlikely(inode_unhashed(inode))) {
> > > - iput(inode);
> > > - goto again;
> > > + if (unlikely(isnew)) {
> > > + wait_on_new_inode(inode);
> > > + if (unlikely(inode_unhashed(inode))) {
> > > + iput(inode);
> > > + goto again;
> > > + }
> > > }
> > > return inode;
> > > }
> > > @@ -1456,7 +1474,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
> > >
> > > spin_lock(&inode_hash_lock);
> > > /* We released the lock, so.. */
> > > - old = find_inode_fast(sb, head, ino, true);
> > > + old = find_inode_fast(sb, head, ino, true, &isnew);
> > > if (!old) {
> > > inode->i_ino = ino;
> > > spin_lock(&inode->i_lock);
> > > @@ -1482,10 +1500,12 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
> > > if (IS_ERR(old))
> > > return NULL;
> > > inode = old;
> > > - wait_on_inode(inode);
> > > - if (unlikely(inode_unhashed(inode))) {
> > > - iput(inode);
> > > - goto again;
> > > + if (unlikely(isnew)) {
> > > + wait_on_new_inode(inode);
> > > + if (unlikely(inode_unhashed(inode))) {
> > > + iput(inode);
> > > + goto again;
> > > + }
> > > }
> > > }
> > > return inode;
> > > @@ -1586,13 +1606,13 @@ EXPORT_SYMBOL(igrab);
> > > * Note2: @test is called with the inode_hash_lock held, so can't sleep.
> > > */
> > > struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
> > > - int (*test)(struct inode *, void *), void *data)
> > > + int (*test)(struct inode *, void *), void *data, bool *isnew)
> > > {
> > > struct hlist_head *head = inode_hashtable + hash(sb, hashval);
> > > struct inode *inode;
> > >
> > > spin_lock(&inode_hash_lock);
> > > - inode = find_inode(sb, head, test, data, true);
> > > + inode = find_inode(sb, head, test, data, true, isnew);
> > > spin_unlock(&inode_hash_lock);
> > >
> > > return IS_ERR(inode) ? NULL : inode;
> > > @@ -1620,16 +1640,19 @@ struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
> > > int (*test)(struct inode *, void *), void *data)
> > > {
> > > struct inode *inode;
> > > + bool isnew;
> > >
> > > might_sleep();
> > >
> > > again:
> > > - inode = ilookup5_nowait(sb, hashval, test, data);
> > > + inode = ilookup5_nowait(sb, hashval, test, data, &isnew);
> > > if (inode) {
> > > - wait_on_inode(inode);
> > > - if (unlikely(inode_unhashed(inode))) {
> > > - iput(inode);
> > > - goto again;
> > > + if (unlikely(isnew)) {
> > > + wait_on_new_inode(inode);
> > > + if (unlikely(inode_unhashed(inode))) {
> > > + iput(inode);
> > > + goto again;
> > > + }
> > > }
> > > }
> > > return inode;
> > > @@ -1648,19 +1671,22 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
> > > {
> > > struct hlist_head *head = inode_hashtable + hash(sb, ino);
> > > struct inode *inode;
> > > + bool isnew;
> > >
> > > might_sleep();
> > >
> > > again:
> > > - inode = find_inode_fast(sb, head, ino, false);
> > > + inode = find_inode_fast(sb, head, ino, false, &isnew);
> > >
> > > if (inode) {
> > > if (IS_ERR(inode))
> > > return NULL;
> > > - wait_on_inode(inode);
> > > - if (unlikely(inode_unhashed(inode))) {
> > > - iput(inode);
> > > - goto again;
> > > + if (unlikely(isnew)) {
> > > + wait_on_new_inode(inode);
> > > + if (unlikely(inode_unhashed(inode))) {
> > > + iput(inode);
> > > + goto again;
> > > + }
> > > }
> > > }
> > > return inode;
> > > @@ -1800,6 +1826,7 @@ int insert_inode_locked(struct inode *inode)
> > > struct super_block *sb = inode->i_sb;
> > > ino_t ino = inode->i_ino;
> > > struct hlist_head *head = inode_hashtable + hash(sb, ino);
> > > + bool isnew;
> > >
> > > might_sleep();
> > >
> > > @@ -1832,12 +1859,15 @@ int insert_inode_locked(struct inode *inode)
> > > return -EBUSY;
> > > }
> > > __iget(old);
> > > + isnew = !!(inode_state_read(old) & I_NEW);
> > > spin_unlock(&old->i_lock);
> > > spin_unlock(&inode_hash_lock);
> > > - wait_on_inode(old);
> > > - if (unlikely(!inode_unhashed(old))) {
> > > - iput(old);
> > > - return -EBUSY;
> > > + if (isnew) {
> > > + wait_on_new_inode(old);
> > > + if (unlikely(!inode_unhashed(old))) {
> > > + iput(old);
> > > + return -EBUSY;
> > > + }
> > > }
> > > iput(old);
> > > }
> > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > index 21c73df3ce75..a813abdcf218 100644
> > > --- a/include/linux/fs.h
> > > +++ b/include/linux/fs.h
> > > @@ -1030,15 +1030,7 @@ static inline void inode_fake_hash(struct inode *inode)
> > > hlist_add_fake(&inode->i_hash);
> > > }
> > >
> > > -static inline void wait_on_inode(struct inode *inode)
> > > -{
> > > - wait_var_event(inode_state_wait_address(inode, __I_NEW),
> > > - !(inode_state_read_once(inode) & I_NEW));
> > > - /*
> > > - * Pairs with routines clearing I_NEW.
> > > - */
> > > - smp_rmb();
> > > -}
> > > +void wait_on_new_inode(struct inode *inode);
> > >
> > > /*
> > > * inode->i_rwsem nesting subclasses for the lock validator:
> > > @@ -3417,7 +3409,7 @@ extern void d_mark_dontcache(struct inode *inode);
> > >
> > > extern struct inode *ilookup5_nowait(struct super_block *sb,
> > > unsigned long hashval, int (*test)(struct inode *, void *),
> > > - void *data);
> > > + void *data, bool *isnew);
> > > extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
> > > int (*test)(struct inode *, void *), void *data);
> > > extern struct inode *ilookup(struct super_block *sb, unsigned long ino);
> > > --
> > > 2.34.1
> > >
Powered by blists - more mailing lists