[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <d2xotmpncid4rlsahjm7lsszqjrgn6kgtta4w5flvsawcp6guu@neqoz2ayoccm>
Date: Tue, 25 Nov 2025 12:26:30 +0100
From: Jan Kara <jack@...e.cz>
To: Mateusz Guzik <mjguzik@...il.com>
Cc: brauner@...nel.org, viro@...iv.linux.org.uk, jack@...e.cz,
linux-kernel@...r.kernel.org, linux-fsdevel@...r.kernel.org
Subject: Re: [PATCH v2] fs: scale opening of character devices
On Fri 21-11-25 08:28:18, Mateusz Guzik wrote:
> chrdev_open() always takes cdev_lock, which is only needed to synchronize
> against cd_forget(). But the latter is only ever called by inode evict(),
> meaning these two can never legally race. Solidify this with asserts.
Are you sure? Because from a quick glance it doesn't seem that inodes hold
a refcount of the cdev. Thus inode->i_cdev can freed if you don't hold
cdev_lock - it is only the cdev_lock that makes cdev_get() safe against UAF
issues in chrdev_open() AFAICS because that blocks cdev_purge() from
completing when last ref of the kobject is dropped...
Honza
>
> More cleanups are needed here but this is enough to get the thing out of
> the way.
>
> Rationale is funny-sounding at first: opening of /dev/zero happens to be
> a contention point in large-scale package building (think 100+ packages
> at the same with a thread count to support it). Such a workload is not
> only very fork+exec heavy, but frequently involves scripts which use the
> idiom of silencing output by redirecting it to /dev/null.
>
> A non-large-scale microbenchmark of opening /dev/null in a loop in 16
> processes:
> before: 2865472
> after: 4011960 (+40%)
>
> Code goes from being bottlenecked on the spinlock to being bottlenecked
> on lockref.
>
> Signed-off-by: Mateusz Guzik <mjguzik@...il.com>
> ---
>
> v2:
> - add back new = NULL lost in refactoring
>
> I'll note for interested my experience with the workload at hand comes
> from FreeBSD and was surprised to find /dev/null on the profile. Given
> that Linux is globally serializing on it, it has to be a factor as well
> in this case.
>
> fs/char_dev.c | 20 +++++++++++---------
> fs/inode.c | 2 +-
> include/linux/cdev.h | 2 +-
> 3 files changed, 13 insertions(+), 11 deletions(-)
>
> diff --git a/fs/char_dev.c b/fs/char_dev.c
> index c2ddb998f3c9..9a6dfab084d1 100644
> --- a/fs/char_dev.c
> +++ b/fs/char_dev.c
> @@ -374,15 +374,15 @@ static int chrdev_open(struct inode *inode, struct file *filp)
> {
> const struct file_operations *fops;
> struct cdev *p;
> - struct cdev *new = NULL;
> int ret = 0;
>
> - spin_lock(&cdev_lock);
> - p = inode->i_cdev;
> + VFS_BUG_ON_INODE(icount_read(inode) < 1, inode);
> +
> + p = READ_ONCE(inode->i_cdev);
> if (!p) {
> struct kobject *kobj;
> + struct cdev *new = NULL;
> int idx;
> - spin_unlock(&cdev_lock);
> kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx);
> if (!kobj)
> return -ENXIO;
> @@ -392,19 +392,19 @@ static int chrdev_open(struct inode *inode, struct file *filp)
> we dropped the lock. */
> p = inode->i_cdev;
> if (!p) {
> - inode->i_cdev = p = new;
> + p = new;
> + WRITE_ONCE(inode->i_cdev, p);
> list_add(&inode->i_devices, &p->list);
> new = NULL;
> } else if (!cdev_get(p))
> ret = -ENXIO;
> + spin_unlock(&cdev_lock);
> + cdev_put(new);
> } else if (!cdev_get(p))
> ret = -ENXIO;
> - spin_unlock(&cdev_lock);
> - cdev_put(new);
> if (ret)
> return ret;
>
> - ret = -ENXIO;
> fops = fops_get(p->ops);
> if (!fops)
> goto out_cdev_put;
> @@ -423,8 +423,10 @@ static int chrdev_open(struct inode *inode, struct file *filp)
> return ret;
> }
>
> -void cd_forget(struct inode *inode)
> +void inode_cdev_forget(struct inode *inode)
> {
> + VFS_BUG_ON_INODE(!(inode_state_read_once(inode) & I_FREEING), inode);
> +
> spin_lock(&cdev_lock);
> list_del_init(&inode->i_devices);
> inode->i_cdev = NULL;
> diff --git a/fs/inode.c b/fs/inode.c
> index a62032864ddf..88be1f20782d 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -840,7 +840,7 @@ static void evict(struct inode *inode)
> clear_inode(inode);
> }
> if (S_ISCHR(inode->i_mode) && inode->i_cdev)
> - cd_forget(inode);
> + inode_cdev_forget(inode);
>
> remove_inode_hash(inode);
>
> diff --git a/include/linux/cdev.h b/include/linux/cdev.h
> index 0e8cd6293deb..bed99967ad90 100644
> --- a/include/linux/cdev.h
> +++ b/include/linux/cdev.h
> @@ -34,6 +34,6 @@ void cdev_device_del(struct cdev *cdev, struct device *dev);
>
> void cdev_del(struct cdev *);
>
> -void cd_forget(struct inode *);
> +void inode_cdev_forget(struct inode *);
>
> #endif
> --
> 2.48.1
>
--
Jan Kara <jack@...e.com>
SUSE Labs, CR
Powered by blists - more mailing lists