lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20250519131151.988900-2-chenridong@huaweicloud.com>
Date: Mon, 19 May 2025 13:11:50 +0000
From: Chen Ridong <chenridong@...weicloud.com>
To: akpm@...ux-foundation.org,
	Liam.Howlett@...cle.com,
	lorenzo.stoakes@...cle.com,
	vbabka@...e.cz,
	jannh@...gle.com,
	pfalcato@...e.de,
	bigeasy@...utronix.de,
	paulmck@...nel.org,
	chenridong@...wei.com,
	roman.gushchin@...ux.dev,
	brauner@...nel.org,
	pmladek@...e.com,
	geert@...ux-m68k.org,
	mingo@...nel.org,
	rrangel@...omium.org,
	francesco@...la.it,
	kpsingh@...nel.org,
	guoweikang.kernel@...il.com,
	link@...o.com,
	viro@...iv.linux.org.uk,
	neil@...wn.name,
	nichen@...as.ac.cn,
	tglx@...utronix.de,
	frederic@...nel.org,
	peterz@...radead.org,
	oleg@...hat.com,
	joel.granados@...nel.org,
	linux@...ssschuh.net,
	avagin@...gle.com,
	legion@...nel.org
Cc: linux-kernel@...r.kernel.org,
	linux-mm@...ck.org,
	lujialin4@...wei.com
Subject: [RFC next v2 1/2] ucounts: free ucount only when count and rlimit are zero

From: Chen Ridong <chenridong@...wei.com>

After the commit fda31c50292a ("signal: avoid double atomic counter
increments for user accounting") and the commit 15bc01effefe ("ucounts:
Fix signal ucount refcounting"), the reference counting mechanism for
ucounts has the following behavior. The reference count is incremented
when the first pending signal pins the ucounts, and it is decremented
when the last pending signal is dequeued. This implies that as long as
there are any pending signals pinned to the ucounts, the ucounts cannot
be freed.

To address the scalability issue described in the next patch, the
ucounts.rlimits will be converted to percpu_counter. However, summing up
the percpu counters is expensive. To overcome this, this patch modifies
the conditions for freeing ucounts. Instead of complex checks regarding
whether a pending signal is the first or the last one, the ucounts can now
be freed only when both the refcount and the rlimits are zero.
This change not only simplifies the logic but also reduces the number of
atomic operations.

Signed-off-by: Chen Ridong <chenridong@...wei.com>
---
 include/linux/user_namespace.h |  1 +
 kernel/ucount.c                | 75 ++++++++++++++++++++++++++--------
 2 files changed, 59 insertions(+), 17 deletions(-)

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index a0bb6d012137..6e2229ea4673 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -122,6 +122,7 @@ struct ucounts {
 	kuid_t uid;
 	struct rcu_head rcu;
 	rcuref_t count;
+	atomic_long_t freed;
 	atomic_long_t ucount[UCOUNT_COUNTS];
 	atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS];
 };
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 8686e329b8f2..125471af7d59 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -185,18 +185,61 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
 	return new;
 }
 
-void put_ucounts(struct ucounts *ucounts)
+/*
+ * Whether all the rlimits are zero.
+ * For now, only UCOUNT_RLIMIT_SIGPENDING is considered.
+ * Other rlimit can be added.
+ */
+static bool rlimits_are_zero(struct ucounts *ucounts)
+{
+	int rtypes[] = { UCOUNT_RLIMIT_SIGPENDING };
+	int rtype;
+
+	for (int i = 0; i < sizeof(rtypes)/sizeof(int); ++i) {
+		rtype = rtypes[i];
+		if (atomic_long_read(&ucounts->rlimit[rtype]) > 0)
+			return false;
+	}
+	return true;
+}
+
+/*
+ * Ucounts can be freed only when the ucount->count is released
+ * and the rlimits are zero.
+ * The caller should hold rcu_read_lock();
+ */
+static bool ucounts_can_be_freed(struct ucounts *ucounts)
+{
+	if (rcuref_read(&ucounts->count) > 0)
+		return false;
+	if (!rlimits_are_zero(ucounts))
+		return false;
+	/* Prevent double free */
+	return atomic_long_cmpxchg(&ucounts->freed, 0, 1) == 0;
+}
+
+static void free_ucounts(struct ucounts *ucounts)
 {
 	unsigned long flags;
 
-	if (rcuref_put(&ucounts->count)) {
-		spin_lock_irqsave(&ucounts_lock, flags);
-		hlist_nulls_del_rcu(&ucounts->node);
-		spin_unlock_irqrestore(&ucounts_lock, flags);
+	spin_lock_irqsave(&ucounts_lock, flags);
+	hlist_nulls_del_rcu(&ucounts->node);
+	spin_unlock_irqrestore(&ucounts_lock, flags);
+
+	put_user_ns(ucounts->ns);
+	kfree_rcu(ucounts, rcu);
+}
 
-		put_user_ns(ucounts->ns);
-		kfree_rcu(ucounts, rcu);
+void put_ucounts(struct ucounts *ucounts)
+{
+	rcu_read_lock();
+	if (rcuref_put(&ucounts->count) &&
+	    ucounts_can_be_freed(ucounts)) {
+		rcu_read_unlock();
+		free_ucounts(ucounts);
+		return;
 	}
+	rcu_read_unlock();
 }
 
 static inline bool atomic_long_inc_below(atomic_long_t *v, int u)
@@ -281,11 +324,17 @@ static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts,
 {
 	struct ucounts *iter, *next;
 	for (iter = ucounts; iter != last; iter = next) {
+		bool to_free;
+
+		rcu_read_lock();
 		long dec = atomic_long_sub_return(1, &iter->rlimit[type]);
 		WARN_ON_ONCE(dec < 0);
 		next = iter->ns->ucounts;
-		if (dec == 0)
-			put_ucounts(iter);
+		to_free = ucounts_can_be_freed(iter);
+		rcu_read_unlock();
+		/* If ucounts->count is zero and the rlimits are zero, free ucounts */
+		if (to_free)
+			free_ucounts(iter);
 	}
 }
 
@@ -310,14 +359,6 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type,
 			ret = new;
 		if (!override_rlimit)
 			max = get_userns_rlimit_max(iter->ns, type);
-		/*
-		 * Grab an extra ucount reference for the caller when
-		 * the rlimit count was previously 0.
-		 */
-		if (new != 1)
-			continue;
-		if (!get_ucounts(iter))
-			goto dec_unwind;
 	}
 	return ret;
 dec_unwind:
-- 
2.34.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ