lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250822-tagebuch-verteidigen-a0865d468f88@brauner>
Date: Fri, 22 Aug 2025 16:55:24 +0200
From: Christian Brauner <brauner@...nel.org>
To: Josef Bacik <josef@...icpanda.com>
Cc: linux-fsdevel@...r.kernel.org, linux-btrfs@...r.kernel.org, 
	kernel-team@...com, linux-ext4@...r.kernel.org, linux-xfs@...r.kernel.org, 
	viro@...iv.linux.org.uk
Subject: Re: [PATCH 14/50] fs: maintain a list of pinned inodes

On Thu, Aug 21, 2025 at 04:18:25PM -0400, Josef Bacik wrote:
> Currently we have relied on dirty inodes and inodes with cache on them
> to simply be left hanging around on the system outside of an LRU. The
> only way to make sure these inodes are eventually reclaimed is because
> dirty writeback will grab a reference on the inode and then iput it when
> it's done, potentially getting it on the LRU. For the cached case the
> page cache deletion path will call inode_add_lru when the inode no
> longer has cached pages in order to make sure the inode object can be
> freed eventually.  In the unmount case we walk all inodes and free them
> so this all works out fine.
> 
> But we want to eliminate 0 i_count objects as a concept, so we need a
> mechanism to hold a reference on these pinned inodes. To that end, add a
> list to the super block that contains any inodes that are cached for one
> reason or another.
> 
> When we call inode_add_lru(), if the inode falls into one of these
> categories, we will add it to the cached inode list and hold an
> i_obj_count reference.  If the inode does not fall into one of these
> categories it will be moved to the normal LRU, which is already holds an
> i_obj_count reference.
> 
> The dirty case we will delete it from the LRU if it is on one, and then
> the iput after the writeout will make sure it's placed onto the correct
> list at that point.
> 
> The page cache case will migrate it when it calls inode_add_lru() when
> deleting pages from the page cache.
> 
> Signed-off-by: Josef Bacik <josef@...icpanda.com>
> ---
>  fs/fs-writeback.c                |   8 +++
>  fs/inode.c                       | 102 +++++++++++++++++++++++++++++--
>  fs/internal.h                    |   1 +
>  fs/super.c                       |   3 +
>  include/linux/fs.h               |  11 ++++
>  include/trace/events/writeback.h |   3 +-
>  6 files changed, 121 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index d2e1fb1a0787..111a9d8215bf 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -2736,6 +2736,14 @@ static void wait_sb_inodes(struct super_block *sb)
>  			continue;
>  		}
>  		__iget(inode);
> +
> +		/*
> +		 * We could have potentially ended up on the cached LRU list, so
> +		 * remove ourselves from this list now that we have a reference,
> +		 * the iput will handle placing it back on the appropriate LRU
> +		 * list if necessary.
> +		 */
> +		inode_lru_list_del(inode);
>  		spin_unlock(&inode->i_lock);
>  		rcu_read_unlock();
>  
> diff --git a/fs/inode.c b/fs/inode.c
> index 94769b356224..adcba0a4d776 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -319,6 +319,23 @@ void free_inode_nonrcu(struct inode *inode)
>  }
>  EXPORT_SYMBOL(free_inode_nonrcu);
>  
> +/*
> + * Some inodes need to stay pinned in memory because they are dirty or there are
> + * cached pages that the VM wants to keep around to avoid thrashing. This does
> + * the appropriate checks to see if we want to sheild this inode from periodic
> + * reclaim. Must be called with ->i_lock held.
> + */
> +static bool inode_needs_cached(struct inode *inode)
> +{
> +	lockdep_assert_held(&inode->i_lock);
> +
> +	if (inode->i_state & (I_DIRTY_ALL | I_SYNC))
> +		return true;
> +	if (!mapping_shrinkable(&inode->i_data))
> +		return true;
> +	return false;
> +}
> +
>  static void i_callback(struct rcu_head *head)
>  {
>  	struct inode *inode = container_of(head, struct inode, i_rcu);
> @@ -532,20 +549,67 @@ void ihold(struct inode *inode)
>  }
>  EXPORT_SYMBOL(ihold);
>  
> +static void inode_add_cached_lru(struct inode *inode)
> +{
> +	lockdep_assert_held(&inode->i_lock);
> +
> +	if (inode->i_state & I_CACHED_LRU)
> +		return;
> +	if (!list_empty(&inode->i_lru))
> +		return;
> +
> +	inode->i_state |= I_CACHED_LRU;
> +	spin_lock(&inode->i_sb->s_cached_inodes_lock);
> +	list_add(&inode->i_lru, &inode->i_sb->s_cached_inodes);
> +	spin_unlock(&inode->i_sb->s_cached_inodes_lock);
> +	iobj_get(inode);
> +}

For mere correctness you likely want the iobj_get() before you're adding
it to the list so it ends up on the list with the i_obj_count already
bumped.

> +
> +static bool __inode_del_cached_lru(struct inode *inode)
> +{
> +	lockdep_assert_held(&inode->i_lock);
> +
> +	if (!(inode->i_state & I_CACHED_LRU))
> +		return false;
> +
> +	inode->i_state &= ~I_CACHED_LRU;
> +	spin_lock(&inode->i_sb->s_cached_inodes_lock);
> +	list_del_init(&inode->i_lru);
> +	spin_unlock(&inode->i_sb->s_cached_inodes_lock);
> +	return true;
> +}
> +
> +static bool inode_del_cached_lru(struct inode *inode)
> +{
> +	if (__inode_del_cached_lru(inode)) {
> +		iobj_put(inode);
> +		return true;
> +	}
> +	return false;
> +}
> +
>  static void __inode_add_lru(struct inode *inode, bool rotate)
>  {
> -	if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
> +	bool need_ref = true;
> +
> +	lockdep_assert_held(&inode->i_lock);
> +
> +	if (inode->i_state & (I_FREEING | I_WILL_FREE))
>  		return;
>  	if (atomic_read(&inode->i_count))

Btw, we have a bunch of atomic_read() calls on i_count. We should really
have a helper for that just like we do for files. So we should add
icount() or something similarly named. Accessing reference counts
directly should ideally never happen...

>  		return;
>  	if (!(inode->i_sb->s_flags & SB_ACTIVE))
>  		return;
> -	if (!mapping_shrinkable(&inode->i_data))
> +	if (inode_needs_cached(inode)) {
> +		inode_add_cached_lru(inode);
>  		return;
> +	}
>  
> +	need_ref = __inode_del_cached_lru(inode) == false;
>  	if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) {
> -		iobj_get(inode);
>  		inode->i_state |= I_LRU;
> +		if (need_ref)
> +			iobj_get(inode);
>  		this_cpu_inc(nr_unused);
>  	} else if (rotate) {
>  		inode->i_state |= I_REFERENCED;
> @@ -573,8 +637,19 @@ void inode_add_lru(struct inode *inode)
>  	__inode_add_lru(inode, false);
>  }
>  
> -static void inode_lru_list_del(struct inode *inode)
> +/*
> + * Caller must be holding it's own i_count reference on this inode in order to
> + * prevent this being the final iput.
> + *
> + * Needs inode->i_lock held.
> + */
> +void inode_lru_list_del(struct inode *inode)
>  {
> +	lockdep_assert_held(&inode->i_lock);
> +
> +	if (inode_del_cached_lru(inode))
> +		return;
> +
>  	if (!(inode->i_state & I_LRU))
>  		return;
>  
> @@ -950,6 +1025,22 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
>  	if (!spin_trylock(&inode->i_lock))
>  		return LRU_SKIP;
>  
> +	/*
> +	 * This inode is either dirty or has page cache we want to keep around,
> +	 * so move it to the cached list.
> +	 *
> +	 * We drop the extra i_obj_count reference we grab when adding it to the
> +	 * cached lru.
> +	 */
> +	if (inode_needs_cached(inode)) {
> +		list_lru_isolate(lru, &inode->i_lru);
> +		inode_add_cached_lru(inode);
> +		iobj_put(inode);
> +		spin_unlock(&inode->i_lock);
> +		this_cpu_dec(nr_unused);
> +		return LRU_REMOVED;
> +	}
> +
>  	/*
>  	 * Inodes can get referenced, redirtied, or repopulated while
>  	 * they're already on the LRU, and this can make them
> @@ -957,8 +1048,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
>  	 * sync, or the last page cache deletion will requeue them.
>  	 */
>  	if (atomic_read(&inode->i_count) ||
> -	    (inode->i_state & ~I_REFERENCED) ||
> -	    !mapping_shrinkable(&inode->i_data)) {
> +	    (inode->i_state & ~I_REFERENCED)) {
>  		list_lru_isolate(lru, &inode->i_lru);
>  		inode->i_state &= ~I_LRU;
>  		spin_unlock(&inode->i_lock);
> diff --git a/fs/internal.h b/fs/internal.h
> index 38e8aab27bbd..17ecee7056d5 100644
> --- a/fs/internal.h
> +++ b/fs/internal.h
> @@ -207,6 +207,7 @@ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
>  int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry);
>  bool in_group_or_capable(struct mnt_idmap *idmap,
>  			 const struct inode *inode, vfsgid_t vfsgid);
> +void inode_lru_list_del(struct inode *inode);
>  
>  /*
>   * fs-writeback.c
> diff --git a/fs/super.c b/fs/super.c
> index a038848e8d1f..bf3e6d9055af 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -364,6 +364,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
>  	spin_lock_init(&s->s_inode_list_lock);
>  	INIT_LIST_HEAD(&s->s_inodes_wb);
>  	spin_lock_init(&s->s_inode_wblist_lock);
> +	INIT_LIST_HEAD(&s->s_cached_inodes);
> +	spin_lock_init(&s->s_cached_inodes_lock);
>  
>  	s->s_count = 1;
>  	atomic_set(&s->s_active, 1);
> @@ -409,6 +411,7 @@ static void __put_super(struct super_block *s)
>  		WARN_ON(s->s_dentry_lru.node);
>  		WARN_ON(s->s_inode_lru.node);
>  		WARN_ON(!list_empty(&s->s_mounts));
> +		WARN_ON(!list_empty(&s->s_cached_inodes));
>  		call_rcu(&s->rcu, destroy_super_rcu);
>  	}
>  }
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 509e696a4df0..8384ed81a5ad 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -749,6 +749,9 @@ is_uncached_acl(struct posix_acl *acl)
>   *			->i_lru is on the LRU and those that are using ->i_lru
>   *			for some other means.
>   *
> + * I_CACHED_LRU		Inode is cached because it is dirty or isn't shrinkable,
> + *			and thus is on the s_cached_inode_lru list.
> + *
>   * Q: What is the difference between I_WILL_FREE and I_FREEING?
>   *
>   * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait
> @@ -786,6 +789,7 @@ enum inode_state_bits {
>  	INODE_BIT(I_SYNC_QUEUED),
>  	INODE_BIT(I_PINNING_NETFS_WB),
>  	INODE_BIT(I_LRU),
> +	INODE_BIT(I_CACHED_LRU),
>  };
>  
>  #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
> @@ -1584,6 +1588,13 @@ struct super_block {
>  
>  	spinlock_t		s_inode_wblist_lock;
>  	struct list_head	s_inodes_wb;	/* writeback inodes */
> +
> +	/*
> +	 * Cached inodes, any inodes that their reference is held by another
> +	 * mechanism, such as dirty inodes or unshrinkable inodes.
> +	 */
> +	spinlock_t		s_cached_inodes_lock;
> +	struct list_head	s_cached_inodes;
>  } __randomize_layout;
>  
>  static inline struct user_namespace *i_user_ns(const struct inode *inode)
> diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
> index 486f85aca84d..6949329c744a 100644
> --- a/include/trace/events/writeback.h
> +++ b/include/trace/events/writeback.h
> @@ -29,7 +29,8 @@
>  		{I_SYNC_QUEUED,		"I_SYNC_QUEUED"},	\
>  		{I_PINNING_NETFS_WB,	"I_PINNING_NETFS_WB"},	\
>  		{I_LRU_ISOLATING,	"I_LRU_ISOLATING"},	\
> -		{I_LRU,			"I_LRU"}		\
> +		{I_LRU,			"I_LRU"},		\
> +		{I_CACHED_LRU,		"I_CACHED_LRU"}		\
>  	)
>  
>  /* enums need to be exported to user space */
> -- 
> 2.49.0
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ