lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220331190827.48241-3-stephen.s.brennan@oracle.com>
Date:   Thu, 31 Mar 2022 12:08:27 -0700
From:   Stephen Brennan <stephen.s.brennan@...cle.com>
To:     Alexander Viro <viro@...iv.linux.org.uk>
Cc:     Andrew Morton <akpm@...ux-foundation.org>,
        Luis Chamberlain <mcgrof@...nel.org>,
        Stephen Brennan <stephen.s.brennan@...cle.com>,
        Arnd Bergmann <arnd@...db.de>,
        Matthew Wilcox <willy@...radead.org>,
        James Bottomley <James.Bottomley@...senPartnership.com>,
        Gao Xiang <hsiangkao@...ux.alibaba.com>,
        Dave Chinner <david@...morbit.com>,
        Roman Gushchin <roman.gushchin@...ux.dev>,
        Colin Walters <walters@...bum.org>,
        linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: [RFC PATCH 2/2] fs/dcache: Add negative-dentry-ratio config

Negative dentry bloat is a well-known problem. For systems without
memory pressure, some workloads (like repeated stat calls) can create an
unbounded amount of negative dentries quite quickly. In the best case,
these dentries could speed up a subsequent name lookup, but in the worst
case, they are never used and their memory never freed.

While systems without memory pressure may not need that memory for other
purposes, negative dentry bloat can have other side-effects, such as
soft lockups when traversing the d_subdirs list, or general slowness in
managing them. It is a good idea to have some sort of mechanism for
controlling negative dentries, even outside memory pressure.

This patch attempts to do so in a fair way. A workload can only build up
many negative dentries by creating new dentries, or by converting
existing dentries from positive to negative. Thus, negative dentry
management is best done during these same operations, as it will
amortize its cost, and distribute the cost to the perpetrators of the
dentry bloat. We introduce a sysctl "negative-dentry-ratio" which sets a
maximum number of negative dentries per positive dentry, N:1. When a
dentry is created or unlinked, the next N+1 dentries of the parent are
scanned. If no positive dentries are found, then a candidate negative
dentry is killed.

Signed-off-by: Stephen Brennan <stephen.s.brennan@...cle.com>
---
 fs/dcache.c            | 93 +++++++++++++++++++++++++++++++++++++++++-
 include/linux/dcache.h |  1 +
 2 files changed, 93 insertions(+), 1 deletion(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index b1480433ddc5..ba79107a8268 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -74,6 +74,8 @@
 int sysctl_vfs_cache_pressure __read_mostly = 100;
 EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
 
+unsigned int sysctl_negative_dentry_ratio = 5;
+
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
 
 EXPORT_SYMBOL(rename_lock);
@@ -183,6 +185,7 @@ static int proc_nr_dentry(struct ctl_table *table, int write, void *buffer,
 	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 }
 
+const unsigned int max_negative_dentry_ratio = 20;
 static struct ctl_table fs_dcache_sysctls[] = {
 	{
 		.procname	= "dentry-state",
@@ -191,6 +194,15 @@ static struct ctl_table fs_dcache_sysctls[] = {
 		.mode		= 0444,
 		.proc_handler	= proc_nr_dentry,
 	},
+	{
+		.procname	= "negative-dentry-ratio",
+		.data		= &sysctl_negative_dentry_ratio,
+		.maxlen	= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra1	= SYSCTL_ZERO,
+		.extra2	= (void *)&max_negative_dentry_ratio,
+	},
 	{ }
 };
 
@@ -547,6 +559,8 @@ static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent)
 	dentry->d_flags |= DCACHE_DENTRY_KILLED;
 	if (unlikely(list_empty(&dentry->d_child)))
 		return;
+	if (!IS_ROOT(dentry) && unlikely(dentry->d_parent->d_scan_cursor == &dentry->d_child))
+		dentry->d_parent->d_scan_cursor = dentry->d_child.next;
 	__list_del_entry(&dentry->d_child);
 	/*
 	 * Cursors can move around the list of children.  While we'd been
@@ -1751,6 +1765,71 @@ void d_invalidate(struct dentry *dentry)
 }
 EXPORT_SYMBOL(d_invalidate);
 
+/**
+ * dentry_incremental_scan - enforce the negative dentry ratio for a directory
+ * @parent: directory whose children are scanned. @parent->d_lock must be
+ *          held on entry; it is always released before this function returns.
+ *
+ * Enforce a requirement that a directory never contains more than N negative
+ * dentries per positive dentry, where N is sysctl_negative_dentry_ratio.
+ * Scan N + 1 child dentries, resuming from where the previous scan left off
+ * (parent->d_scan_cursor). If no positive dentries are found in that window,
+ * kill one unreferenced negative dentry. A ratio of 0 disables the scan.
+ */
+static void dentry_incremental_scan(struct dentry *parent)
+{
+	struct dentry *dentry, *candidate = NULL;
+	struct list_head *cursor, *start;
+	unsigned int flags;
+	int refcount;
+	int nr_to_scan = sysctl_negative_dentry_ratio + 1;
+	int nr_positive = 0;
+
+	/* ratio == 0 means the feature is disabled */
+	if (nr_to_scan == 1)
+		goto out_unlock;
+
+	/* Resume from the saved cursor, or start at the first child. */
+	cursor = start = parent->d_scan_cursor ?: parent->d_subdirs.next;
+
+	/*
+	 * Children are inspected under RCU without taking their d_lock;
+	 * d_flags and the refcount may therefore be stale. Any kill decision
+	 * is re-validated under the candidate's d_lock below.
+	 */
+	rcu_read_lock();
+	while (nr_to_scan--) {
+		/* Skip the list head when the cursor wraps past it. */
+		if (cursor == &parent->d_subdirs)
+			goto next;
+		dentry = container_of(cursor, struct dentry, d_child);
+		flags = READ_ONCE(dentry->d_flags);
+		refcount = READ_ONCE(dentry->d_lockref.count);
+		if (d_flags_negative(flags)) {
+			/* Only an unreferenced negative dentry may be killed;
+			 * remember the first one we see. */
+			if (!refcount && !dentry->d_inode && !candidate)
+				candidate = dentry;
+		} else {
+			nr_positive++;
+		}
+	next:
+		cursor = cursor->next;
+		/* Wrapped all the way around: every child was scanned. */
+		if (cursor == start) {
+			rcu_read_unlock();
+			goto out_unlock;
+		}
+	}
+
+	/* Save our position so the next call continues the scan (this is
+	 * what amortizes the cost across many create/unlink operations). */
+	parent->d_scan_cursor = cursor;
+	/* Any positive dentry in the window means the ratio is satisfied. */
+	if (nr_positive || !candidate) {
+		rcu_read_unlock();
+		goto out_unlock;
+	}
+
+	/*
+	 * Lock the candidate while still in the RCU read section so the
+	 * dentry memory cannot be freed out from under us; parent->d_lock
+	 * (still held) gives the required parent-then-child lock order.
+	 */
+	spin_lock(&candidate->d_lock);
+	rcu_read_unlock();
+
+	/* Need to re-read candidate's flags and inode. */
+	if (!d_is_negative(candidate) || candidate->d_inode ||
+	    candidate->d_lockref.count) {
+		spin_unlock(&candidate->d_lock);
+		goto out_unlock;
+	}
+	/* No need to cond_resched(); we're not repeating this operation */
+	/* NOTE(review): assumes __dentry_kill() drops the parent's d_lock
+	 * on this path — confirm against the rest of this series. */
+	__dentry_kill(candidate, false);
+	/* parent->d_lock is now unlocked */
+	return;
+
+out_unlock:
+	spin_unlock(&parent->d_lock);
+}
+
 /**
  * __d_alloc	-	allocate a dcache entry
  * @sb: filesystem it will belong to
@@ -1814,6 +1893,7 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 	dentry->d_sb = sb;
 	dentry->d_op = NULL;
 	dentry->d_fsdata = NULL;
+	dentry->d_scan_cursor = NULL;
 	INIT_HLIST_BL_NODE(&dentry->d_hash);
 	INIT_LIST_HEAD(&dentry->d_lru);
 	INIT_LIST_HEAD(&dentry->d_subdirs);
@@ -1858,7 +1938,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 	__dget_dlock(parent);
 	dentry->d_parent = parent;
 	list_add(&dentry->d_child, &parent->d_subdirs);
-	spin_unlock(&parent->d_lock);
+	dentry_incremental_scan(parent); /* unlocks parent */
 
 	return dentry;
 }
@@ -2521,6 +2601,7 @@ EXPORT_SYMBOL(d_hash_and_lookup);
 void d_delete(struct dentry * dentry)
 {
 	struct inode *inode = dentry->d_inode;
+	struct dentry *parent = NULL;
 
 	spin_lock(&inode->i_lock);
 	spin_lock(&dentry->d_lock);
@@ -2529,7 +2610,17 @@ void d_delete(struct dentry * dentry)
 	 */
 	if (dentry->d_lockref.count == 1) {
 		dentry->d_flags &= ~DCACHE_CANT_MOUNT;
+		if (!IS_ROOT(dentry))
+			parent = dentry->d_parent;
 		dentry_unlink_inode(dentry);
+		/*
+		 * Since we have created a negative dentry, continue the
+		 * incremental scan to keep enforcing negative dentry ratio.
+		 */
+		if (parent) {
+			spin_lock(&parent->d_lock);
+			dentry_incremental_scan(parent); /* unlocks parent */
+		}
 	} else {
 		__d_drop(dentry);
 		spin_unlock(&dentry->d_lock);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index f5bba51480b2..59c240d8c493 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -102,6 +102,7 @@ struct dentry {
 	};
 	struct list_head d_child;	/* child of parent list */
 	struct list_head d_subdirs;	/* our children */
+	struct list_head *d_scan_cursor;
 	/*
 	 * d_alias and d_rcu can share memory
 	 */
-- 
2.30.2

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ