[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250822-tagebuch-verteidigen-a0865d468f88@brauner>
Date: Fri, 22 Aug 2025 16:55:24 +0200
From: Christian Brauner <brauner@...nel.org>
To: Josef Bacik <josef@...icpanda.com>
Cc: linux-fsdevel@...r.kernel.org, linux-btrfs@...r.kernel.org,
kernel-team@...com, linux-ext4@...r.kernel.org, linux-xfs@...r.kernel.org,
viro@...iv.linux.org.uk
Subject: Re: [PATCH 14/50] fs: maintain a list of pinned inodes
On Thu, Aug 21, 2025 at 04:18:25PM -0400, Josef Bacik wrote:
> Currently we have relied on dirty inodes and inodes with cache on them
> to simply be left hanging around on the system outside of an LRU. The
> only way to make sure these inodes are eventually reclaimed is because
> dirty writeback will grab a reference on the inode and then iput it when
> it's done, potentially getting it on the LRU. For the cached case the
> page cache deletion path will call inode_add_lru when the inode no
> longer has cached pages in order to make sure the inode object can be
> freed eventually. In the unmount case we walk all inodes and free them
> so this all works out fine.
>
> But we want to eliminate 0 i_count objects as a concept, so we need a
> mechanism to hold a reference on these pinned inodes. To that end, add a
> list to the super block that contains any inodes that are cached for one
> reason or another.
>
> When we call inode_add_lru(), if the inode falls into one of these
> categories, we will add it to the cached inode list and hold an
> i_obj_count reference. If the inode does not fall into one of these
> categories it will be moved to the normal LRU, which already holds an
> i_obj_count reference.
>
> In the dirty case we will delete it from the LRU if it is on one, and then
> the iput after the writeout will make sure it's placed onto the correct
> list at that point.
>
> The page cache case will migrate it when it calls inode_add_lru() when
> deleting pages from the page cache.
>
> Signed-off-by: Josef Bacik <josef@...icpanda.com>
> ---
> fs/fs-writeback.c | 8 +++
> fs/inode.c | 102 +++++++++++++++++++++++++++++--
> fs/internal.h | 1 +
> fs/super.c | 3 +
> include/linux/fs.h | 11 ++++
> include/trace/events/writeback.h | 3 +-
> 6 files changed, 121 insertions(+), 7 deletions(-)
>
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index d2e1fb1a0787..111a9d8215bf 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -2736,6 +2736,14 @@ static void wait_sb_inodes(struct super_block *sb)
> continue;
> }
> __iget(inode);
> +
> + /*
> + * We could have potentially ended up on the cached LRU list, so
> + * remove ourselves from this list now that we have a reference,
> + * the iput will handle placing it back on the appropriate LRU
> + * list if necessary.
> + */
> + inode_lru_list_del(inode);
> spin_unlock(&inode->i_lock);
> rcu_read_unlock();
>
> diff --git a/fs/inode.c b/fs/inode.c
> index 94769b356224..adcba0a4d776 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -319,6 +319,23 @@ void free_inode_nonrcu(struct inode *inode)
> }
> EXPORT_SYMBOL(free_inode_nonrcu);
>
> +/*
> + * Some inodes need to stay pinned in memory because they are dirty or there are
> + * cached pages that the VM wants to keep around to avoid thrashing. This does
> + * the appropriate checks to see if we want to sheild this inode from periodic
> + * reclaim. Must be called with ->i_lock held.
> + */
> +static bool inode_needs_cached(struct inode *inode)
> +{
> + lockdep_assert_held(&inode->i_lock);
> +
> + if (inode->i_state & (I_DIRTY_ALL | I_SYNC))
> + return true;
> + if (!mapping_shrinkable(&inode->i_data))
> + return true;
> + return false;
> +}
> +
> static void i_callback(struct rcu_head *head)
> {
> struct inode *inode = container_of(head, struct inode, i_rcu);
> @@ -532,20 +549,67 @@ void ihold(struct inode *inode)
> }
> EXPORT_SYMBOL(ihold);
>
> +static void inode_add_cached_lru(struct inode *inode)
> +{
> + lockdep_assert_held(&inode->i_lock);
> +
> + if (inode->i_state & I_CACHED_LRU)
> + return;
> + if (!list_empty(&inode->i_lru))
> + return;
> +
> + inode->i_state |= I_CACHED_LRU;
> + spin_lock(&inode->i_sb->s_cached_inodes_lock);
> + list_add(&inode->i_lru, &inode->i_sb->s_cached_inodes);
> + spin_unlock(&inode->i_sb->s_cached_inodes_lock);
> + iobj_get(inode);
> +}
For mere correctness you likely want the iobj_get() before you're adding
it to the list so it ends up on the list with the i_obj_count already
bumped.
> +
> +static bool __inode_del_cached_lru(struct inode *inode)
> +{
> + lockdep_assert_held(&inode->i_lock);
> +
> + if (!(inode->i_state & I_CACHED_LRU))
> + return false;
> +
> + inode->i_state &= ~I_CACHED_LRU;
> + spin_lock(&inode->i_sb->s_cached_inodes_lock);
> + list_del_init(&inode->i_lru);
> + spin_unlock(&inode->i_sb->s_cached_inodes_lock);
> + return true;
> +}
> +
> +static bool inode_del_cached_lru(struct inode *inode)
> +{
> + if (__inode_del_cached_lru(inode)) {
> + iobj_put(inode);
> + return true;
> + }
> + return false;
> +}
> +
> static void __inode_add_lru(struct inode *inode, bool rotate)
> {
> - if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
> + bool need_ref = true;
> +
> + lockdep_assert_held(&inode->i_lock);
> +
> + if (inode->i_state & (I_FREEING | I_WILL_FREE))
> return;
> if (atomic_read(&inode->i_count))
Btw, we have a bunch of atomic_read() calls on i_count. We should really
have a helper for that just like we do for files. So we should add
icount() or something similarly named. Accessing reference counts
directly should ideally never happen...
> return;
> if (!(inode->i_sb->s_flags & SB_ACTIVE))
> return;
> - if (!mapping_shrinkable(&inode->i_data))
> + if (inode_needs_cached(inode)) {
> + inode_add_cached_lru(inode);
> return;
> + }
>
> + need_ref = __inode_del_cached_lru(inode) == false;
> if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) {
> - iobj_get(inode);
> inode->i_state |= I_LRU;
> + if (need_ref)
> + iobj_get(inode);
> this_cpu_inc(nr_unused);
> } else if (rotate) {
> inode->i_state |= I_REFERENCED;
> @@ -573,8 +637,19 @@ void inode_add_lru(struct inode *inode)
> __inode_add_lru(inode, false);
> }
>
> -static void inode_lru_list_del(struct inode *inode)
> +/*
> + * Caller must be holding its own i_count reference on this inode in order to
> + * prevent this being the final iput.
> + *
> + * Needs inode->i_lock held.
> + */
> +void inode_lru_list_del(struct inode *inode)
> {
> + lockdep_assert_held(&inode->i_lock);
> +
> + if (inode_del_cached_lru(inode))
> + return;
> +
> if (!(inode->i_state & I_LRU))
> return;
>
> @@ -950,6 +1025,22 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
> if (!spin_trylock(&inode->i_lock))
> return LRU_SKIP;
>
> + /*
> + * This inode is either dirty or has page cache we want to keep around,
> + * so move it to the cached list.
> + *
> + * We drop the extra i_obj_count reference we grab when adding it to the
> + * cached lru.
> + */
> + if (inode_needs_cached(inode)) {
> + list_lru_isolate(lru, &inode->i_lru);
> + inode_add_cached_lru(inode);
> + iobj_put(inode);
> + spin_unlock(&inode->i_lock);
> + this_cpu_dec(nr_unused);
> + return LRU_REMOVED;
> + }
> +
> /*
> * Inodes can get referenced, redirtied, or repopulated while
> * they're already on the LRU, and this can make them
> @@ -957,8 +1048,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
> * sync, or the last page cache deletion will requeue them.
> */
> if (atomic_read(&inode->i_count) ||
> - (inode->i_state & ~I_REFERENCED) ||
> - !mapping_shrinkable(&inode->i_data)) {
> + (inode->i_state & ~I_REFERENCED)) {
> list_lru_isolate(lru, &inode->i_lru);
> inode->i_state &= ~I_LRU;
> spin_unlock(&inode->i_lock);
> diff --git a/fs/internal.h b/fs/internal.h
> index 38e8aab27bbd..17ecee7056d5 100644
> --- a/fs/internal.h
> +++ b/fs/internal.h
> @@ -207,6 +207,7 @@ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
> int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry);
> bool in_group_or_capable(struct mnt_idmap *idmap,
> const struct inode *inode, vfsgid_t vfsgid);
> +void inode_lru_list_del(struct inode *inode);
>
> /*
> * fs-writeback.c
> diff --git a/fs/super.c b/fs/super.c
> index a038848e8d1f..bf3e6d9055af 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -364,6 +364,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
> spin_lock_init(&s->s_inode_list_lock);
> INIT_LIST_HEAD(&s->s_inodes_wb);
> spin_lock_init(&s->s_inode_wblist_lock);
> + INIT_LIST_HEAD(&s->s_cached_inodes);
> + spin_lock_init(&s->s_cached_inodes_lock);
>
> s->s_count = 1;
> atomic_set(&s->s_active, 1);
> @@ -409,6 +411,7 @@ static void __put_super(struct super_block *s)
> WARN_ON(s->s_dentry_lru.node);
> WARN_ON(s->s_inode_lru.node);
> WARN_ON(!list_empty(&s->s_mounts));
> + WARN_ON(!list_empty(&s->s_cached_inodes));
> call_rcu(&s->rcu, destroy_super_rcu);
> }
> }
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 509e696a4df0..8384ed81a5ad 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -749,6 +749,9 @@ is_uncached_acl(struct posix_acl *acl)
> * ->i_lru is on the LRU and those that are using ->i_lru
> * for some other means.
> *
> + * I_CACHED_LRU Inode is cached because it is dirty or isn't shrinkable,
> + * and thus is on the s_cached_inodes list.
> + *
> * Q: What is the difference between I_WILL_FREE and I_FREEING?
> *
> * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait
> @@ -786,6 +789,7 @@ enum inode_state_bits {
> INODE_BIT(I_SYNC_QUEUED),
> INODE_BIT(I_PINNING_NETFS_WB),
> INODE_BIT(I_LRU),
> + INODE_BIT(I_CACHED_LRU),
> };
>
> #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
> @@ -1584,6 +1588,13 @@ struct super_block {
>
> spinlock_t s_inode_wblist_lock;
> struct list_head s_inodes_wb; /* writeback inodes */
> +
> + /*
> + * Cached inodes: any inodes whose reference is held by another
> + * mechanism, such as dirty inodes or unshrinkable inodes.
> + */
> + spinlock_t s_cached_inodes_lock;
> + struct list_head s_cached_inodes;
> } __randomize_layout;
>
> static inline struct user_namespace *i_user_ns(const struct inode *inode)
> diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
> index 486f85aca84d..6949329c744a 100644
> --- a/include/trace/events/writeback.h
> +++ b/include/trace/events/writeback.h
> @@ -29,7 +29,8 @@
> {I_SYNC_QUEUED, "I_SYNC_QUEUED"}, \
> {I_PINNING_NETFS_WB, "I_PINNING_NETFS_WB"}, \
> {I_LRU_ISOLATING, "I_LRU_ISOLATING"}, \
> - {I_LRU, "I_LRU"} \
> + {I_LRU, "I_LRU"}, \
> + {I_CACHED_LRU, "I_CACHED_LRU"} \
> )
>
> /* enums need to be exported to user space */
> --
> 2.49.0
>
Powered by blists - more mailing lists