lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20211214005337.161885-2-stephen.s.brennan@oracle.com>
Date:   Mon, 13 Dec 2021 16:53:34 -0800
From:   Stephen Brennan <stephen.s.brennan@...cle.com>
To:     Alexander Viro <viro@...iv.linux.org.uk>
Cc:     Stephen Brennan <stephen.s.brennan@...cle.com>,
        Gautham Ananthakrishna <gautham.ananthakrishna@...cle.com>,
        Konstantin Khlebnikov <khlebnikov@...dex-team.ru>,
        linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: [PATCH 1/4] dcache: sweep cached negative dentries to the end of list of siblings

For disk filesystems result of every negative lookup is cached, content
of directories is usually cached too. Production of negative dentries
isn't limited with disk speed. It's really easy to generate millions of
them if system has enough memory. Negative dentries are linked into
siblings list along with normal positive dentries. Some operations walks
dcache tree but looks only for positive dentries: most important is
fsnotify/inotify.

This patch sweeps negative dentries to the end of list at final dput()
and marks with flag which tells that all following dentries are negative
too. We do this carefully to avoid corruption in case the dentry is
killed when we try to lock its parent. Reverse operation (recycle) is
required before instantiating tail negative dentry, or calling d_add()
with non-NULL inode.

Co-authored-by: Konstantin Khlebnikov <khlebnikov@...dex-team.ru>
Co-authored-by: Gautham Ananthakrishna <gautham.ananthakrishna@...cle.com>
Signed-off-by: Stephen Brennan <stephen.s.brennan@...cle.com>
---

Previously Al was concerned about sweep_negative() calling lock_parent() 
when it held no reference, so there was no guarantee that dentry would 
stay allocated. I've addressed this by adding an RCU critical section 
into sweep_negative(), ensuring that all writes to dentry occurs within 
the outer critical section. Thus, the dentry could not be freed yet. By 
checking d_count once the lock is regained, we can be sure that the 
dentry has not yet been killed.

This is not necessary for recycle_negative(), since it is called by 
d_add and d_instantiate(), both of whose callers should hold a 
reference.

The other major concern for this patch was that not all ways for a 
negative dentry to be converted to positive were caught by 
recycle_negative(). I've moved the call into __d_instantiate(), and 
added a call for __d_add().

 fs/dcache.c            | 85 +++++++++++++++++++++++++++++++++++++++---
 include/linux/dcache.h |  6 +++
 2 files changed, 86 insertions(+), 5 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index cf871a81f4fd..685b91866e59 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -635,6 +635,58 @@ static inline struct dentry *lock_parent(struct dentry *dentry)
 	return __lock_parent(dentry);
 }
 
+/*
+ * Move cached negative dentry to the tail of parent->d_subdirs.
+ * This lets walkers skip them all together at first sight.
+ * Must be called at dput of negative dentry with d_lock held.
+ * Releases d_lock.
+ */
+static void sweep_negative(struct dentry *dentry)
+{
+	struct dentry *parent;
+
+	rcu_read_lock();
+	parent = lock_parent(dentry);
+	if (!parent) {
+		rcu_read_unlock();
+		return;
+	}
+
+	/*
+	 * If we did not hold a reference to dentry (as in the case of dput),
+	 * and dentry->d_lock was dropped in lock_parent(), then we could now be
+	 * holding onto a dead dentry. Be careful to check d_count and unlock
+	 * before dropping RCU lock, otherwise we could corrupt freed memory.
+	 */
+	if (!d_count(dentry) && d_is_negative(dentry) &&
+		!d_is_tail_negative(dentry)) {
+		dentry->d_flags |= DCACHE_TAIL_NEGATIVE;
+		list_move_tail(&dentry->d_child, &parent->d_subdirs);
+	}
+
+	spin_unlock(&parent->d_lock);
+	spin_unlock(&dentry->d_lock);
+	rcu_read_unlock();
+}
+
+/*
+ * Undo sweep_negative() and move to the head of parent->d_subdirs.
+ * Must be called before converting negative dentry into positive.
+ * Must hold dentry->d_lock, we may drop and re-grab it via lock_parent().
+ * Must be hold a reference or be otherwise sure the dentry cannot be killed.
+ */
+static void recycle_negative(struct dentry *dentry)
+{
+	struct dentry *parent;
+
+	parent = lock_parent(dentry);
+	dentry->d_flags &= ~DCACHE_TAIL_NEGATIVE;
+	if (parent) {
+		list_move(&dentry->d_child, &parent->d_subdirs);
+		spin_unlock(&parent->d_lock);
+	}
+}
+
 static inline bool retain_dentry(struct dentry *dentry)
 {
 	WARN_ON(d_in_lookup(dentry));
@@ -740,7 +792,7 @@ static struct dentry *dentry_kill(struct dentry *dentry)
 static inline bool fast_dput(struct dentry *dentry)
 {
 	int ret;
-	unsigned int d_flags;
+	unsigned int d_flags, required;
 
 	/*
 	 * If we have a d_op->d_delete() operation, we sould not
@@ -788,6 +840,8 @@ static inline bool fast_dput(struct dentry *dentry)
 	 * a 'delete' op, and it's referenced and already on
 	 * the LRU list.
 	 *
+	 * Cached negative dentry must be swept to the tail.
+	 *
 	 * NOTE! Since we aren't locked, these values are
 	 * not "stable". However, it is sufficient that at
 	 * some point after we dropped the reference the
@@ -805,11 +859,16 @@ static inline bool fast_dput(struct dentry *dentry)
 	 */
 	smp_rmb();
 	d_flags = READ_ONCE(dentry->d_flags);
+
+	required = DCACHE_REFERENCED | DCACHE_LRU_LIST |
+		(d_flags_negative(d_flags) ? DCACHE_TAIL_NEGATIVE : 0);
+
 	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST |
-			DCACHE_DISCONNECTED | DCACHE_DONTCACHE;
+			DCACHE_DISCONNECTED | DCACHE_DONTCACHE |
+			DCACHE_TAIL_NEGATIVE;
 
 	/* Nothing to do? Dropping the reference was all we needed? */
-	if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry))
+	if (d_flags == required && !d_unhashed(dentry))
 		return true;
 
 	/*
@@ -881,7 +940,10 @@ void dput(struct dentry *dentry)
 		rcu_read_unlock();
 
 		if (likely(retain_dentry(dentry))) {
-			spin_unlock(&dentry->d_lock);
+			if (d_is_negative(dentry) && !d_is_tail_negative(dentry))
+				sweep_negative(dentry); /* drops d_lock */
+			else
+				spin_unlock(&dentry->d_lock);
 			return;
 		}
 
@@ -1973,6 +2035,8 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 	WARN_ON(d_in_lookup(dentry));
 
 	spin_lock(&dentry->d_lock);
+	if (d_is_tail_negative(dentry))
+		recycle_negative(dentry);
 	/*
 	 * Decrement negative dentry count if it was in the LRU list.
 	 */
@@ -2697,6 +2761,13 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode)
 	struct inode *dir = NULL;
 	unsigned n;
 	spin_lock(&dentry->d_lock);
+	/*
+	 * Tail negative dentries must become positive before associating an
+	 * inode. recycle_negative() is safe to use because we hold a reference
+	 * to dentry.
+	 */
+	if (inode && d_is_tail_negative(dentry))
+		recycle_negative(dentry);
 	if (unlikely(d_in_lookup(dentry))) {
 		dir = dentry->d_parent->d_inode;
 		n = start_dir_add(dir);
@@ -2713,7 +2784,11 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode)
 	__d_rehash(dentry);
 	if (dir)
 		end_dir_add(dir, n);
-	spin_unlock(&dentry->d_lock);
+
+	if (!inode && !d_is_tail_negative(dentry))
+		sweep_negative(dentry); /* drops d_lock */
+	else
+		spin_unlock(&dentry->d_lock);
 	if (inode)
 		spin_unlock(&inode->i_lock);
 }
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 9e23d33bb6f1..b877c9ca212f 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -221,6 +221,7 @@ struct dentry_operations {
 #define DCACHE_PAR_LOOKUP		0x10000000 /* being looked up (with parent locked shared) */
 #define DCACHE_DENTRY_CURSOR		0x20000000
 #define DCACHE_NORCU			0x40000000 /* No RCU delay for freeing */
+#define DCACHE_TAIL_NEGATIVE		0x80000000 /* All following siblings are negative */
 
 extern seqlock_t rename_lock;
 
@@ -499,6 +500,11 @@ static inline int simple_positive(const struct dentry *dentry)
 	return d_really_is_positive(dentry) && !d_unhashed(dentry);
 }
 
+static inline bool d_is_tail_negative(const struct dentry *dentry)
+{
+	return unlikely(dentry->d_flags & DCACHE_TAIL_NEGATIVE);
+}
+
 extern void d_set_fallthru(struct dentry *dentry);
 
 static inline bool d_is_fallthru(const struct dentry *dentry)
-- 
2.30.2

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ