Message-ID: <Y4+RPry2tfbWFdSA@cmpxchg.org>
Date:   Tue, 6 Dec 2022 20:00:14 +0100
From:   Johannes Weiner <hannes@...xchg.org>
To:     Ivan Babrou <ivan@...udflare.com>
Cc:     Linux MM <linux-mm@...ck.org>,
        Linux Kernel Network Developers <netdev@...r.kernel.org>,
        linux-kernel <linux-kernel@...r.kernel.org>,
        Michal Hocko <mhocko@...nel.org>,
        Roman Gushchin <roman.gushchin@...ux.dev>,
        Shakeel Butt <shakeelb@...gle.com>,
        Muchun Song <songmuchun@...edance.com>,
        Andrew Morton <akpm@...ux-foundation.org>,
        Eric Dumazet <edumazet@...gle.com>,
        "David S. Miller" <davem@...emloft.net>,
        Hideaki YOSHIFUJI <yoshfuji@...ux-ipv6.org>,
        David Ahern <dsahern@...nel.org>,
        Jakub Kicinski <kuba@...nel.org>,
        Paolo Abeni <pabeni@...hat.com>, cgroups@...r.kernel.org,
        kernel-team <kernel-team@...udflare.com>
Subject: Re: Low TCP throughput due to vmpressure with swap enabled

On Mon, Dec 05, 2022 at 04:50:46PM -0800, Ivan Babrou wrote:
> And now I can see plenty of this:
> 
> [  108.156707][ T5175] socket pressure[2]: 4294673429
> [  108.157050][ T5175] socket pressure[2]: 4294673429
> [  108.157301][ T5175] socket pressure[2]: 4294673429
> [  108.157581][ T5175] socket pressure[2]: 4294673429
> [  108.157874][ T5175] socket pressure[2]: 4294673429
> [  108.158254][ T5175] socket pressure[2]: 4294673429
> 
> I think the first result below is to blame:
> 
> $ rg '.->socket_pressure' mm
> mm/memcontrol.c
> 5280: memcg->socket_pressure = jiffies;
> 7198: memcg->socket_pressure = 0;
> 7201: memcg->socket_pressure = 1;
> 7211: memcg->socket_pressure = 0;
> 7215: memcg->socket_pressure = 1;

Hoo boy, that's a silly mistake indeed. Thanks for tracking it down.

> While we set socket_pressure to either zero or one in
> mem_cgroup_charge_skmem, it is still initialized to jiffies on memcg
> creation. Zero seems like a more appropriate starting point. With that
> change I see it working as expected with no TCP speed bumps. My
> ebpf_exporter program also looks happy and reports zero clamps in my
> brief testing.

Excellent, now this behavior makes sense.
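
Just to spell it out for the archives: because the new check treats any
non-zero socket_pressure as "under pressure", a memcg that still gets the
old jiffies initialization looks pressured forever, until skmem charging
happens to clear the field. A minimal userspace sketch of that interaction
(the field and function names only mirror the kernel ones, this is not
kernel code):

/*
 * Standalone illustration: a leftover jiffies value in socket_pressure
 * reads as permanent pressure once the field is treated as a boolean.
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_memcg {
	unsigned long socket_pressure;	/* old type: set to jiffies at init */
};

/* New-style check: any non-zero value means pressure. */
static bool under_socket_pressure(const struct fake_memcg *memcg)
{
	return memcg->socket_pressure != 0;
}

int main(void)
{
	/* Old init path: memcg->socket_pressure = jiffies; */
	struct fake_memcg memcg = { .socket_pressure = 4294673429UL };

	/* Prints "pressure: 1", matching the clamping seen above. */
	printf("pressure: %d\n", under_socket_pressure(&memcg));
	return 0;
}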

> I also think we should downgrade socket_pressure from "unsigned long"
> to "bool", as it only holds zero and one now.

Sounds good to me!

Attaching the updated patch below. If nobody has any objections, I'll
add a proper changelog, reported-bys, sign-off, etc., and send it out.
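
For review, the cgroup2 branch of mem_cgroup_charge_skmem() in the hunk
below is equivalent to this straight-line restatement (no gotos; the
charge_skmem_v2 name is only for this sketch and the patch itself is
authoritative, this is just to make the control flow easier to eyeball):

static bool charge_skmem_v2(struct mem_cgroup *memcg, gfp_t gfp_mask,
			    unsigned int nr_pages)
{
	/* Try a regular charge first, without the NOFAIL override. */
	if (try_charge(memcg, gfp_mask & ~__GFP_NOFAIL, nr_pages) == 0) {
		memcg->socket_pressure = false;
	} else {
		/* Charging failed: signal pressure to the networking side. */
		memcg->socket_pressure = true;
		if (!(gfp_mask & __GFP_NOFAIL))
			return false;
		/* NOFAIL callers are force-charged over the limit. */
		try_charge(memcg, gfp_mask, nr_pages);
	}
	mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
	return true;
}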

---
 include/linux/memcontrol.h |  8 +++---
 include/linux/vmpressure.h |  7 ++---
 mm/memcontrol.c            | 20 +++++++++----
 mm/vmpressure.c            | 58 ++++++--------------------------------
 mm/vmscan.c                | 15 +---------
 5 files changed, 30 insertions(+), 78 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e1644a24009c..ef1c388be5b3 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -283,11 +283,11 @@ struct mem_cgroup {
 	atomic_long_t		memory_events[MEMCG_NR_MEMORY_EVENTS];
 	atomic_long_t		memory_events_local[MEMCG_NR_MEMORY_EVENTS];
 
-	unsigned long		socket_pressure;
+	/* Socket memory allocations have failed */
+	bool			socket_pressure;
 
 	/* Legacy tcp memory accounting */
 	bool			tcpmem_active;
-	int			tcpmem_pressure;
 
 #ifdef CONFIG_MEMCG_KMEM
 	int kmemcg_id;
@@ -1701,10 +1701,10 @@ void mem_cgroup_sk_alloc(struct sock *sk);
 void mem_cgroup_sk_free(struct sock *sk);
 static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 {
-	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure)
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->socket_pressure)
 		return true;
 	do {
-		if (time_before(jiffies, READ_ONCE(memcg->socket_pressure)))
+		if (memcg->socket_pressure)
 			return true;
 	} while ((memcg = parent_mem_cgroup(memcg)));
 	return false;
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
index 6a2f51ebbfd3..20d93de37a17 100644
--- a/include/linux/vmpressure.h
+++ b/include/linux/vmpressure.h
@@ -11,9 +11,6 @@
 #include <linux/eventfd.h>
 
 struct vmpressure {
-	unsigned long scanned;
-	unsigned long reclaimed;
-
 	unsigned long tree_scanned;
 	unsigned long tree_reclaimed;
 	/* The lock is used to keep the scanned/reclaimed above in sync. */
@@ -30,7 +27,7 @@ struct vmpressure {
 struct mem_cgroup;
 
 #ifdef CONFIG_MEMCG
-extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
+extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
 		       unsigned long scanned, unsigned long reclaimed);
 extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
 
@@ -44,7 +41,7 @@ extern int vmpressure_register_event(struct mem_cgroup *memcg,
 extern void vmpressure_unregister_event(struct mem_cgroup *memcg,
 					struct eventfd_ctx *eventfd);
 #else
-static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
+static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
 			      unsigned long scanned, unsigned long reclaimed) {}
 static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg,
 				   int prio) {}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2d8549ae1b30..0d4b9dbe775a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5277,7 +5277,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	vmpressure_init(&memcg->vmpressure);
 	INIT_LIST_HEAD(&memcg->event_list);
 	spin_lock_init(&memcg->event_list_lock);
-	memcg->socket_pressure = jiffies;
 #ifdef CONFIG_MEMCG_KMEM
 	memcg->kmemcg_id = -1;
 	INIT_LIST_HEAD(&memcg->objcg_list);
@@ -7195,10 +7194,10 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
 		struct page_counter *fail;
 
 		if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
-			memcg->tcpmem_pressure = 0;
+			memcg->socket_pressure = false;
 			return true;
 		}
-		memcg->tcpmem_pressure = 1;
+		memcg->socket_pressure = true;
 		if (gfp_mask & __GFP_NOFAIL) {
 			page_counter_charge(&memcg->tcpmem, nr_pages);
 			return true;
@@ -7206,12 +7205,21 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
 		return false;
 	}
 
-	if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
-		mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
-		return true;
+	if (try_charge(memcg, gfp_mask & ~__GFP_NOFAIL, nr_pages) == 0) {
+		memcg->socket_pressure = false;
+		goto success;
+	}
+	memcg->socket_pressure = true;
+	if (gfp_mask & __GFP_NOFAIL) {
+		try_charge(memcg, gfp_mask, nr_pages);
+		goto success;
 	}
 
 	return false;
+
+success:
+	mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
+	return true;
 }
 
 /**
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index b52644771cc4..4cec90711cf4 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -219,7 +219,6 @@ static void vmpressure_work_fn(struct work_struct *work)
  * vmpressure() - Account memory pressure through scanned/reclaimed ratio
  * @gfp:	reclaimer's gfp mask
  * @memcg:	cgroup memory controller handle
- * @tree:	legacy subtree mode
  * @scanned:	number of pages scanned
  * @reclaimed:	number of pages reclaimed
  *
@@ -227,16 +226,9 @@ static void vmpressure_work_fn(struct work_struct *work)
  * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
  * pressure index is then further refined and averaged over time.
  *
- * If @tree is set, vmpressure is in traditional userspace reporting
- * mode: @memcg is considered the pressure root and userspace is
- * notified of the entire subtree's reclaim efficiency.
- *
- * If @tree is not set, reclaim efficiency is recorded for @memcg, and
- * only in-kernel users are notified.
- *
  * This function does not return any value.
  */
-void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
+void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
 		unsigned long scanned, unsigned long reclaimed)
 {
 	struct vmpressure *vmpr;
@@ -271,46 +263,14 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
 	if (!scanned)
 		return;
 
-	if (tree) {
-		spin_lock(&vmpr->sr_lock);
-		scanned = vmpr->tree_scanned += scanned;
-		vmpr->tree_reclaimed += reclaimed;
-		spin_unlock(&vmpr->sr_lock);
-
-		if (scanned < vmpressure_win)
-			return;
-		schedule_work(&vmpr->work);
-	} else {
-		enum vmpressure_levels level;
-
-		/* For now, no users for root-level efficiency */
-		if (!memcg || mem_cgroup_is_root(memcg))
-			return;
-
-		spin_lock(&vmpr->sr_lock);
-		scanned = vmpr->scanned += scanned;
-		reclaimed = vmpr->reclaimed += reclaimed;
-		if (scanned < vmpressure_win) {
-			spin_unlock(&vmpr->sr_lock);
-			return;
-		}
-		vmpr->scanned = vmpr->reclaimed = 0;
-		spin_unlock(&vmpr->sr_lock);
+	spin_lock(&vmpr->sr_lock);
+	scanned = vmpr->tree_scanned += scanned;
+	vmpr->tree_reclaimed += reclaimed;
+	spin_unlock(&vmpr->sr_lock);
 
-		level = vmpressure_calc_level(scanned, reclaimed);
-
-		if (level > VMPRESSURE_LOW) {
-			/*
-			 * Let the socket buffer allocator know that
-			 * we are having trouble reclaiming LRU pages.
-			 *
-			 * For hysteresis keep the pressure state
-			 * asserted for a second in which subsequent
-			 * pressure events can occur.
-			 */
-			WRITE_ONCE(memcg->socket_pressure, jiffies + HZ);
-		}
-	}
+	if (scanned < vmpressure_win)
+		return;
+	schedule_work(&vmpr->work);
 }
 
 /**
@@ -340,7 +300,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
 	 * to the vmpressure() basically means that we signal 'critical'
 	 * level.
 	 */
-	vmpressure(gfp, memcg, true, vmpressure_win, 0);
+	vmpressure(gfp, memcg, vmpressure_win, 0);
 }
 
 #define MAX_VMPRESSURE_ARGS_LEN	(strlen("critical") + strlen("hierarchy") + 2)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 04d8b88e5216..d348366d58d4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6035,8 +6035,6 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 	memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
 	do {
 		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
-		unsigned long reclaimed;
-		unsigned long scanned;
 
 		/*
 		 * This loop can become CPU-bound when target memcgs
@@ -6068,20 +6066,9 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 			memcg_memory_event(memcg, MEMCG_LOW);
 		}
 
-		reclaimed = sc->nr_reclaimed;
-		scanned = sc->nr_scanned;
-
 		shrink_lruvec(lruvec, sc);
-
 		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
 			    sc->priority);
-
-		/* Record the group's reclaim efficiency */
-		if (!sc->proactive)
-			vmpressure(sc->gfp_mask, memcg, false,
-				   sc->nr_scanned - scanned,
-				   sc->nr_reclaimed - reclaimed);
-
 	} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
 }
 
@@ -6111,7 +6098,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 
 	/* Record the subtree's reclaim efficiency */
 	if (!sc->proactive)
-		vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
+		vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
 			   sc->nr_scanned - nr_scanned,
 			   sc->nr_reclaimed - nr_reclaimed);
 
-- 
2.38.1
