lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20240522135444.1685642-13-haakon.bugge@oracle.com>
Date: Wed, 22 May 2024 15:54:44 +0200
From: Håkon Bugge <haakon.bugge@...cle.com>
To: linux-rdma@...r.kernel.org, linux-kernel@...r.kernel.org,
        netdev@...r.kernel.org, rds-devel@....oracle.com
Cc: Jason Gunthorpe <jgg@...pe.ca>, Leon Romanovsky <leon@...nel.org>,
        Saeed Mahameed <saeedm@...dia.com>, Tariq Toukan <tariqt@...dia.com>,
        "David S . Miller" <davem@...emloft.net>,
        Eric Dumazet <edumazet@...gle.com>, Jakub Kicinski <kuba@...nel.org>,
        Paolo Abeni <pabeni@...hat.com>, Tejun Heo <tj@...nel.org>,
        Lai Jiangshan <jiangshanlai@...il.com>,
        Allison Henderson <allison.henderson@...cle.com>,
        Manjunath Patil <manjunath.b.patil@...cle.com>,
        Mark Zhang <markzhang@...dia.com>,
        Håkon Bugge <haakon.bugge@...cle.com>,
        Chuck Lever <chuck.lever@...cle.com>,
        Shiraz Saleem <shiraz.saleem@...el.com>,
        Yang Li <yang.lee@...ux.alibaba.com>
Subject: [PATCH v3 6/6] workqueue: Inherit per-process allocation flags

For drivers/modules running inside a memalloc_flags_{save,restore}
region, if a work-queue is created, we make sure work executed on the
work-queue inherits the same flag(s).

This is in order to conditionally enable drivers to work aligned with
block I/O devices. This commit makes sure that any work queued later
on work-queues created during module initialization, when current's
flags have any of the PF_MEMALLOC* flags set, will inherit the same flags.

We do this in order to enable drivers to be used as a network block
I/O device. This is in order to support XFS or other file systems on
top of a raw block device that uses said drivers as the network
transport layer.

Under intense memory pressure, we get memory reclaims. Assume the
file-system reclaims memory, goes to the raw block device, which calls
into said drivers. Now, if regular GFP_KERNEL allocations in the
drivers require reclaims to be fulfilled, we end up in a circular
dependency.

We break this circular dependency by:

1. Force all allocations in the drivers to use GFP_NOIO, by means of a
   parenthetic use of memalloc_flags_{save,restore} on all relevant
   entry points, setting/clearing the PF_MEMALLOC_NOIO bit.

2. Make sure work-queues inherit current->flags
   wrt. PF_MEMALLOC_NOIO, such that work executed on the
   work-queue inherits the same flag(s). That is what this commit
   contributes.

Signed-off-by: Håkon Bugge <haakon.bugge@...cle.com>

---

v2 -> v3:
   * Add support for all PF_MEMALLOC* flags
   * Re-worded commit message

v1 -> v2:
   * Added missing hunk in alloc_workqueue()
---
 include/linux/workqueue.h |  9 ++++++
 kernel/workqueue.c        | 60 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index fb39938945365..f8c87f824272b 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -406,9 +406,18 @@ enum wq_flags {
 	__WQ_DRAINING		= 1 << 16, /* internal: workqueue is draining */
 	__WQ_ORDERED		= 1 << 17, /* internal: workqueue is ordered */
 	__WQ_LEGACY		= 1 << 18, /* internal: create*_workqueue() */
+	__WQ_MEMALLOC		= 1 << 19, /* internal: execute work with MEMALLOC */
+	__WQ_MEMALLOC_NOFS      = 1 << 20, /* internal: execute work with MEMALLOC_NOFS */
+	__WQ_MEMALLOC_NOIO      = 1 << 21, /* internal: execute work with MEMALLOC_NOIO */
+	__WQ_MEMALLOC_NORECLAIM = 1 << 22, /* internal: execute work with MEMALLOC_NORECLAIM */
+	__WQ_MEMALLOC_NOWARN    = 1 << 23, /* internal: execute work with MEMALLOC_NOWARN */
+	__WQ_MEMALLOC_PIN	= 1 << 24, /* internal: execute work with MEMALLOC_PIN */
 
 	/* BH wq only allows the following flags */
 	__WQ_BH_ALLOWS		= WQ_BH | WQ_HIGHPRI,
+
+	__WQ_PF_MEMALLOC_MASK	= PF_MEMALLOC | PF_MEMALLOC_NOFS | PF_MEMALLOC_NOIO |
+				  PF_MEMALLOC_NORECLAIM | PF_MEMALLOC_NOWARN | PF_MEMALLOC_PIN,
 };
 
 enum wq_consts {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 003474c9a77d0..28ed6b9556e91 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -51,6 +51,7 @@
 #include <linux/uaccess.h>
 #include <linux/sched/isolation.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/mm.h>
 #include <linux/nmi.h>
 #include <linux/kvm_para.h>
 #include <linux/delay.h>
@@ -3113,6 +3114,28 @@ static bool manage_workers(struct worker *worker)
 	return true;
 }
 
+static unsigned int wq_build_memalloc_flags(struct pool_workqueue *pwq)
+{
+	unsigned int pf_flags = 0;	/* PF_MEMALLOC* bits to hand back */
+
+#define BUILD_PF_FLAGS_FROM_WQ(name)			\
+	do {						\
+		if (pwq->wq->flags & __WQ_ ## name)	\
+			pf_flags |= PF_ ## name;	\
+	} while (0)
+
+	BUILD_PF_FLAGS_FROM_WQ(MEMALLOC);	/* __WQ_MEMALLOC -> PF_MEMALLOC */
+	BUILD_PF_FLAGS_FROM_WQ(MEMALLOC_NOFS);
+	BUILD_PF_FLAGS_FROM_WQ(MEMALLOC_NOIO);
+	BUILD_PF_FLAGS_FROM_WQ(MEMALLOC_NORECLAIM);
+	BUILD_PF_FLAGS_FROM_WQ(MEMALLOC_NOWARN);
+	BUILD_PF_FLAGS_FROM_WQ(MEMALLOC_PIN);
+
+#undef BUILD_PF_FLAGS_FROM_WQ
+
+	return pf_flags;	/* 0 when the wq has no __WQ_MEMALLOC* bit set */
+}
+
 /**
  * process_one_work - process single work
  * @worker: self
@@ -3136,6 +3159,8 @@ __acquires(&pool->lock)
 	unsigned long work_data;
 	int lockdep_start_depth, rcu_start_depth;
 	bool bh_draining = pool->flags & POOL_BH_DRAINING;
+	unsigned int memalloc_flags = wq_build_memalloc_flags(pwq);
+	unsigned int memalloc_flags_old;
 #ifdef CONFIG_LOCKDEP
 	/*
 	 * It is permissible to free the struct work_struct from
@@ -3148,6 +3173,10 @@ __acquires(&pool->lock)
 
 	lockdep_copy_map(&lockdep_map, &work->lockdep_map);
 #endif
+	/* Set inherited alloc flags */
+	if (memalloc_flags)
+		memalloc_flags_old = memalloc_flags_save(memalloc_flags);
+
 	/* ensure we're on the correct CPU */
 	WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
 		     raw_smp_processor_id() != pool->cpu);
@@ -3284,6 +3313,10 @@ __acquires(&pool->lock)
 
 	/* must be the last step, see the function comment */
 	pwq_dec_nr_in_flight(pwq, work_data);
+
+	/* Restore alloc flags */
+	if (memalloc_flags)
+		memalloc_flags_restore(memalloc_flags_old);
 }
 
 /**
@@ -5637,6 +5670,30 @@ static void wq_adjust_max_active(struct workqueue_struct *wq)
 	} while (activated);
 }
 
+/**
+ * wq_set_memalloc_flags - mirror current's PF_MEMALLOC* flags into WQ flags
+ * @flags_ptr: Pointer to wq->flags
+ *
+ * Sets __WQ_MEMALLOC_FOO_BAR for each PF_MEMALLOC_FOO_BAR in current->flags.
+ */
+static void wq_set_memalloc_flags(unsigned int *flags_ptr)
+{
+#define TEST_PF_SET_WQ(name)				\
+	do {						\
+		if (current->flags & PF_ ## name)	\
+			*flags_ptr |= __WQ_ ## name;	\
+	} while (0)
+
+	TEST_PF_SET_WQ(MEMALLOC);	/* PF_MEMALLOC -> __WQ_MEMALLOC */
+	TEST_PF_SET_WQ(MEMALLOC_NOFS);
+	TEST_PF_SET_WQ(MEMALLOC_NOIO);
+	TEST_PF_SET_WQ(MEMALLOC_NORECLAIM);
+	TEST_PF_SET_WQ(MEMALLOC_NOWARN);
+	TEST_PF_SET_WQ(MEMALLOC_PIN);
+
+#undef TEST_PF_SET_WQ
+}
+
 __printf(1, 4)
 struct workqueue_struct *alloc_workqueue(const char *fmt,
 					 unsigned int flags,
@@ -5695,6 +5752,9 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
 
 	/* init wq */
 	wq->flags = flags;
+	if (current->flags & __WQ_PF_MEMALLOC_MASK)
+		wq_set_memalloc_flags(&wq->flags);
+
 	wq->max_active = max_active;
 	wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE);
 	wq->saved_max_active = wq->max_active;
-- 
2.31.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ