Message-ID: <20210324190626.564297-4-guro@fb.com>
Date:   Wed, 24 Mar 2021 12:06:25 -0700
From:   Roman Gushchin <guro@...com>
To:     Dennis Zhou <dennis@...nel.org>
CC:     Tejun Heo <tj@...nel.org>, Christoph Lameter <cl@...ux.com>,
        Andrew Morton <akpm@...ux-foundation.org>,
        <linux-mm@...ck.org>, <linux-kernel@...r.kernel.org>,
        Roman Gushchin <guro@...com>
Subject: [PATCH rfc 3/4] percpu: on demand chunk depopulation

To return unused memory to the system, schedule an asynchronous
depopulation of percpu chunks.

To balance between scanning too much (and creating overhead from
pcpu_lock contention) and scanning too little, track the number of
chunks worth scanning and mark chunks which are potentially good
targets for depopulation with a new boolean flag.  The async
depopulation work clears the flag after trying to depopulate a
chunk, whether it succeeds or not.
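
In condensed form, the marking/clearing protocol looks roughly like
this (an illustrative sketch of the hunks below, with the surrounding
loops elided; both sides run under pcpu_lock):

	/* free path: mark a chunk as a depopulation candidate */
	if (!chunk->depopulate && !chunk->immutable) {
		chunk->depopulate = true;
		pcpu_nr_chunks_to_depopulate++;
	}

	/* async worker: consume the hint, whether depopulation succeeds or not */
	if (chunk->depopulate) {
		chunk->depopulate = false;
		pcpu_nr_chunks_to_depopulate--;
	}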

This commit suggests the following logic: if a chunk
  1) has more than 1/4 of total pages free and populated
  2) isn't a reserved chunk
  3) isn't entirely free
  4) isn't alone in the corresponding slot
it's a good target for depopulation.

If there are two or more such chunks, an async depopulation is
scheduled.  A condensed sketch of these criteria follows.
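
Expressed as a hypothetical helper (illustrative only; the patch
open-codes these checks in free_percpu() below, and the "not alone in
its slot" condition is checked by walking the slot list at the call
site):

	static bool pcpu_chunk_is_depopulate_candidate(struct pcpu_chunk *chunk)
	{
		/* 2) reserved/first chunks are immutable, never depopulate them */
		if (chunk->immutable)
			return false;

		/* 3) entirely free chunks are handled by pcpu_balance_free() */
		if (chunk->free_bytes == pcpu_unit_size)
			return false;

		/* 1) more than 1/4 of the pages are free and populated */
		return chunk->nr_empty_pop_pages > chunk->nr_pages / 4;
	}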

Because chunk population and depopulation are opposite processes
which make little sense together, move the populating part of
pcpu_balance_populated() into a new pcpu_grow_populated() and make
pcpu_balance_populated() call into either pcpu_grow_populated() or
pcpu_shrink_populated() conditionally.
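
For reference, the resulting dispatch logic condenses to roughly the
following (a simplified sketch of the new pcpu_balance_populated()
added below; the real version also resets pcpu_atomic_alloc_failed):

	static void pcpu_balance_populated(enum pcpu_chunk_type type)
	{
		if (pcpu_atomic_alloc_failed)
			/* best effort: always populate the maximum amount */
			pcpu_grow_populated(type, PCPU_EMPTY_POP_PAGES_HIGH);
		else if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH)
			pcpu_grow_populated(type, PCPU_EMPTY_POP_PAGES_HIGH -
						  pcpu_nr_empty_pop_pages);
		else if (pcpu_nr_chunks_to_depopulate > 0)
			pcpu_shrink_populated(type);
	}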

Signed-off-by: Roman Gushchin <guro@...com>
---
 mm/percpu-internal.h |   1 +
 mm/percpu.c          | 111 ++++++++++++++++++++++++++++++++-----------
 2 files changed, 85 insertions(+), 27 deletions(-)

diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index 18b768ac7dca..1c5b92af02eb 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -67,6 +67,7 @@ struct pcpu_chunk {
 
 	void			*data;		/* chunk data */
 	bool			immutable;	/* no [de]population allowed */
+	bool			depopulate;	/* depopulation hint */
 	int			start_offset;	/* the overlap with the previous
 						   region to have a page aligned
 						   base_addr */
diff --git a/mm/percpu.c b/mm/percpu.c
index 015d076893f5..148137f0fc0b 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -178,6 +178,12 @@ static LIST_HEAD(pcpu_map_extend_chunks);
  */
 int pcpu_nr_empty_pop_pages;
 
+/*
+ * Track the number of chunks with a lot of free memory.
+ * It's used to release unused pages to the system.
+ */
+static int pcpu_nr_chunks_to_depopulate;
+
 /*
  * The number of populated pages in use by the allocator, protected by
  * pcpu_lock.  This number is kept per a unit per chunk (i.e. when a page gets
@@ -1955,6 +1961,11 @@ static void pcpu_balance_free(enum pcpu_chunk_type type)
 		if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
 			continue;
 
+		if (chunk->depopulate) {
+			chunk->depopulate = false;
+			pcpu_nr_chunks_to_depopulate--;
+		}
+
 		list_move(&chunk->list, &to_free);
 	}
 
@@ -1976,7 +1987,7 @@ static void pcpu_balance_free(enum pcpu_chunk_type type)
 }
 
 /**
- * pcpu_balance_populated - manage the amount of populated pages
+ * pcpu_grow_populated - populate chunk(s) to satisfy atomic allocations
  * @type: chunk type
  *
  * Maintain a certain amount of populated pages to satisfy atomic allocations.
@@ -1985,35 +1996,15 @@ static void pcpu_balance_free(enum pcpu_chunk_type type)
  * allocation causes the failure as it is possible that requests can be
  * serviced from already backed regions.
  */
-static void pcpu_balance_populated(enum pcpu_chunk_type type)
+static void pcpu_grow_populated(enum pcpu_chunk_type type, int nr_to_pop)
 {
 	/* gfp flags passed to underlying allocators */
 	const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
 	struct list_head *pcpu_slot = pcpu_chunk_list(type);
 	struct pcpu_chunk *chunk;
-	int slot, nr_to_pop, ret;
+	int slot, ret;
 
-	/*
-	 * Ensure there are certain number of free populated pages for
-	 * atomic allocs.  Fill up from the most packed so that atomic
-	 * allocs don't increase fragmentation.  If atomic allocation
-	 * failed previously, always populate the maximum amount.  This
-	 * should prevent atomic allocs larger than PAGE_SIZE from keeping
-	 * failing indefinitely; however, large atomic allocs are not
-	 * something we support properly and can be highly unreliable and
-	 * inefficient.
-	 */
 retry_pop:
-	if (pcpu_atomic_alloc_failed) {
-		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
-		/* best effort anyway, don't worry about synchronization */
-		pcpu_atomic_alloc_failed = false;
-	} else {
-		nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
-				  pcpu_nr_empty_pop_pages,
-				  0, PCPU_EMPTY_POP_PAGES_HIGH);
-	}
-
 	for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
 		unsigned int nr_unpop = 0, rs, re;
 
@@ -2084,9 +2075,18 @@ static void pcpu_shrink_populated(enum pcpu_chunk_type type)
 		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
 			bool isolated = false;
 
-			if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH)
+			if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH ||
+			    pcpu_nr_chunks_to_depopulate < 1)
 				break;
 
+			/*
+			 * Don't try to depopulate a chunk again and again.
+			 */
+			if (!chunk->depopulate)
+				continue;
+			chunk->depopulate = false;
+			pcpu_nr_chunks_to_depopulate--;
+
 			for (i = 0, start = -1; i < chunk->nr_pages; i++) {
 				if (!chunk->nr_empty_pop_pages)
 					break;
@@ -2153,6 +2153,41 @@ static void pcpu_shrink_populated(enum pcpu_chunk_type type)
 	spin_unlock_irq(&pcpu_lock);
 }
 
+/**
+ * pcpu_balance_populated - manage the amount of populated pages
+ * @type: chunk type
+ *
+ * Populate or depopulate chunks to maintain a certain amount
+ * of free pages to satisfy atomic allocations, but not waste
+ * large amounts of memory.
+ */
+static void pcpu_balance_populated(enum pcpu_chunk_type type)
+{
+	int nr_to_pop;
+
+	/*
+	 * Ensure there are certain number of free populated pages for
+	 * atomic allocs.  Fill up from the most packed so that atomic
+	 * allocs don't increase fragmentation.  If atomic allocation
+	 * failed previously, always populate the maximum amount.  This
+	 * should prevent atomic allocs larger than PAGE_SIZE from keeping
+	 * failing indefinitely; however, large atomic allocs are not
+	 * something we support properly and can be highly unreliable and
+	 * inefficient.
+	 */
+	if (pcpu_atomic_alloc_failed) {
+		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
+		/* best effort anyway, don't worry about synchronization */
+		pcpu_atomic_alloc_failed = false;
+		pcpu_grow_populated(type, nr_to_pop);
+	} else if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
+		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH - pcpu_nr_empty_pop_pages;
+		pcpu_grow_populated(type, nr_to_pop);
+	} else if (pcpu_nr_chunks_to_depopulate > 0) {
+		pcpu_shrink_populated(type);
+	}
+}
+
 /**
  * pcpu_balance_workfn - manage the amount of free chunks and populated pages
  * @work: unused
@@ -2188,6 +2223,7 @@ void free_percpu(void __percpu *ptr)
 	int size, off;
 	bool need_balance = false;
 	struct list_head *pcpu_slot;
+	struct pcpu_chunk *pos;
 
 	if (!ptr)
 		return;
@@ -2207,15 +2243,36 @@ void free_percpu(void __percpu *ptr)
 
 	pcpu_memcg_free_hook(chunk, off, size);
 
-	/* if there are more than one fully free chunks, wake up grim reaper */
 	if (chunk->free_bytes == pcpu_unit_size) {
-		struct pcpu_chunk *pos;
-
+		/*
+		 * If there are more than one fully free chunks,
+		 * wake up grim reaper.
+		 */
 		list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
 			if (pos != chunk) {
 				need_balance = true;
 				break;
 			}
+
+	} else if (chunk->nr_empty_pop_pages > chunk->nr_pages / 4) {
+		/*
+		 * If there is more than one chunk in the slot and
+		 * at least 1/4 of its pages are empty, mark the chunk
+		 * as a target for the depopulation. If there is more
+		 * than one chunk like this, schedule an async balancing.
+		 */
+		int nslot = pcpu_chunk_slot(chunk);
+
+		list_for_each_entry(pos, &pcpu_slot[nslot], list)
+			if (pos != chunk && !chunk->depopulate &&
+			    !chunk->immutable) {
+				chunk->depopulate = true;
+				pcpu_nr_chunks_to_depopulate++;
+				break;
+			}
+
+		if (pcpu_nr_chunks_to_depopulate > 1)
+			need_balance = true;
 	}
 
 	trace_percpu_free_percpu(chunk->base_addr, off, ptr);
-- 
2.30.2
