Message-ID: <20250819114932.597600-6-dev@lankhorst.se>
Date: Tue, 19 Aug 2025 13:49:34 +0200
From: Maarten Lankhorst <dev@...khorst.se>
To: Lucas De Marchi <lucas.demarchi@...el.com>,
	'Thomas Hellström' <thomas.hellstrom@...ux.intel.com>,
	Rodrigo Vivi <rodrigo.vivi@...el.com>,
	David Airlie <airlied@...il.com>,
	Simona Vetter <simona@...ll.ch>,
	Maarten Lankhorst <dev@...khorst.se>,
	Maxime Ripard <mripard@...nel.org>,
	Natalie Vock <natalie.vock@....de>,
	Tejun Heo <tj@...nel.org>,
	Johannes Weiner <hannes@...xchg.org>,
	'Michal Koutný' <mkoutny@...e.com>,
	Michal Hocko <mhocko@...nel.org>,
	Roman Gushchin <roman.gushchin@...ux.dev>,
	Shakeel Butt <shakeel.butt@...ux.dev>,
	Muchun Song <muchun.song@...ux.dev>,
	Andrew Morton <akpm@...ux-foundation.org>,
	David Hildenbrand <david@...hat.com>,
	Lorenzo Stoakes <lorenzo.stoakes@...cle.com>,
	"'Liam R . Howlett'" <Liam.Howlett@...cle.com>,
	Vlastimil Babka <vbabka@...e.cz>,
	Mike Rapoport <rppt@...nel.org>,
	Suren Baghdasaryan <surenb@...gle.com>,
	Thomas Zimmermann <tzimmermann@...e.de>
Cc: Michal Hocko <mhocko@...e.com>,
	intel-xe@...ts.freedesktop.org,
	dri-devel@...ts.freedesktop.org,
	linux-kernel@...r.kernel.org,
	cgroups@...r.kernel.org,
	linux-mm@...ck.org
Subject: [RFC 1/3] page_counter: Allow for pinning some amount of memory

Add a pinned member, and use it to implement pinning accounting.
Memory to be pinned must already be accounted for as normally used,
and only memory up to the 'min' limit is allowed to be pinned.

This limit is chosen because cgroups already guarantees that memory
up to that limit will not be evicted.

Pinned memory affects the min and low protection calculations, so adjust
those slightly: once a cgroup's pinned count exceeds its min (or low)
setting, that protection is treated as fully consumed and the effective
value drops to zero.
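
As a usage illustration (not part of this patch), a hypothetical caller,
e.g. a driver accounting a long-term pinned buffer against a memcg's
page_counter, might pair the new helpers roughly like this; the function
names and error handling are illustrative only:

	/*
	 * Illustrative sketch: the counter must already have been
	 * charged for these pages via page_counter_try_charge().
	 */
	static int example_pin(struct page_counter *counter,
			       unsigned long nr_pages)
	{
		/* Fails once the pinned total would exceed 'min'. */
		if (!page_counter_try_pin(counter, nr_pages))
			return -ENOMEM;

		/* ... actually pin the backing pages here ... */
		return 0;
	}

	static void example_unpin(struct page_counter *counter,
				  unsigned long nr_pages)
	{
		/* ... unpin the backing pages ... */
		page_counter_unpin(counter, nr_pages);
	}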

Signed-off-by: Maarten Lankhorst <dev@...khorst.se>
---
 include/linux/page_counter.h |  8 +++
 mm/page_counter.c            | 98 +++++++++++++++++++++++++++++++++---
 2 files changed, 99 insertions(+), 7 deletions(-)

diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index d649b6bbbc871..5836c6dfb3c76 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -13,6 +13,7 @@ struct page_counter {
 	 * v2. The memcg->memory.usage is a hot member of struct mem_cgroup.
 	 */
 	atomic_long_t usage;
+	atomic_long_t pinned;
 	unsigned long failcnt; /* v1-only field */
 
 	CACHELINE_PADDING(_pad1_);
@@ -68,11 +69,18 @@ static inline unsigned long page_counter_read(struct page_counter *counter)
 	return atomic_long_read(&counter->usage);
 }
 
+static inline unsigned long page_counter_pinned(struct page_counter *counter)
+{
+	return atomic_long_read(&counter->pinned);
+}
+
 void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages);
 void page_counter_charge(struct page_counter *counter, unsigned long nr_pages);
 bool page_counter_try_charge(struct page_counter *counter,
 			     unsigned long nr_pages,
 			     struct page_counter **fail);
+bool page_counter_try_pin(struct page_counter *counter, unsigned long nr_pages);
+void page_counter_unpin(struct page_counter *counter, unsigned long nr_pages);
 void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
 void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages);
 void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages);
diff --git a/mm/page_counter.c b/mm/page_counter.c
index 661e0f2a5127a..d29d0ed01ec18 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -184,6 +184,82 @@ void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
 		page_counter_cancel(c, nr_pages);
 }
 
+static void page_counter_unpin_one(struct page_counter *counter, unsigned long nr_pages)
+{
+	long new;
+
+	new = atomic_long_sub_return(nr_pages, &counter->pinned);
+	/* More unpins than pins? */
+	if (WARN_ONCE(new < 0, "page_counter pinned underflow: %ld nr_pages=%lu\n",
+		      new, nr_pages))
+		atomic_long_set(&counter->pinned, 0);
+}
+
+/**
+ * page_counter_try_pin - try to hierarchically pin pages
+ * @counter: counter
+ * @nr_pages: number of pages to pin
+ *
+ * Returns %true on success, or %false if any counter would go above
+ * the 'min' limit. The failing cgroup is not returned, as pinned
+ * memory cannot be evicted.
+ */
+bool page_counter_try_pin(struct page_counter *counter,
+			     unsigned long nr_pages)
+{
+	struct page_counter *c, *fail;
+	bool track_failcnt = counter->track_failcnt;
+
+	for (c = counter; c; c = c->parent) {
+		long new;
+		/*
+		 * Pin speculatively to avoid an expensive CAS.  If
+		 * a bigger pin fails, it might falsely lock out a
+		 * racing smaller pin and fail it early, but the
+		 * error is limited to the difference between the
+		 * two sizes, which is less than 2M/4M in case of a
+		 * THP locking out a regular page pin.
+		 *
+		 * The atomic_long_add_return() implies a full memory
+		 * barrier between incrementing the count and reading
+		 * the limit.  When racing with page_counter_set_max(),
+		 * we either see the new limit or the setter sees the
+		 * counter has changed and retries.
+		 */
+		new = atomic_long_add_return(nr_pages, &c->pinned);
+		if (new > READ_ONCE(c->min)) {
+			atomic_long_sub(nr_pages, &c->pinned);
+			/*
+			 * This is racy, but we can live with some
+			 * inaccuracy in the failcnt which is only used
+			 * to report stats.
+			 */
+			if (track_failcnt)
+				data_race(c->failcnt++);
+			fail = c;
+			goto failed;
+		}
+	}
+	return true;
+
+failed:
+	for (c = counter; c != fail; c = c->parent)
+		page_counter_unpin_one(c, nr_pages);
+
+	return false;
+}
+
+/**
+ * page_counter_unpin - hierarchically unpin pages
+ * @counter: counter
+ * @nr_pages: number of pages to unpin
+ */
+void page_counter_unpin(struct page_counter *counter, unsigned long nr_pages)
+{
+	for (struct page_counter *c = counter; c; c = c->parent)
+		page_counter_unpin_one(c, nr_pages);
+}
+
 /**
  * page_counter_set_max - set the maximum number of pages allowed
  * @counter: counter
@@ -425,7 +501,7 @@ void page_counter_calculate_protection(struct page_counter *root,
 				       struct page_counter *counter,
 				       bool recursive_protection)
 {
-	unsigned long usage, parent_usage;
+	unsigned long usage, parent_usage, pinned, min, low;
 	struct page_counter *parent = counter->parent;
 
 	/*
@@ -442,23 +518,31 @@ void page_counter_calculate_protection(struct page_counter *root,
 	if (!usage)
 		return;
 
+	pinned = page_counter_pinned(counter);
+
+	/* If pinned memory exceeds min/low, treat that protection as fully consumed */
+	min = READ_ONCE(counter->min);
+	if (pinned > min)
+		min = 0;
+	low = READ_ONCE(counter->low);
+	if (pinned > low)
+		low = 0;
+
 	if (parent == root) {
-		counter->emin = READ_ONCE(counter->min);
-		counter->elow = READ_ONCE(counter->low);
+		counter->emin = min;
+		counter->elow = low;
 		return;
 	}
 
 	parent_usage = page_counter_read(parent);
 
 	WRITE_ONCE(counter->emin, effective_protection(usage, parent_usage,
-			READ_ONCE(counter->min),
-			READ_ONCE(parent->emin),
+			min, READ_ONCE(parent->emin),
 			atomic_long_read(&parent->children_min_usage),
 			recursive_protection));
 
 	WRITE_ONCE(counter->elow, effective_protection(usage, parent_usage,
-			READ_ONCE(counter->low),
-			READ_ONCE(parent->elow),
+			low, READ_ONCE(parent->elow),
 			atomic_long_read(&parent->children_low_usage),
 			recursive_protection));
 }
-- 
2.50.0

