Message-Id: <20251016102345.2200815-1-jackzxcui1989@163.com>
Date: Thu, 16 Oct 2025 18:23:45 +0800
From: Xin Zhao <jackzxcui1989@....com>
To: tj@...nel.org,
jiangshanlai@...il.com
Cc: linux-kernel@...r.kernel.org,
Xin Zhao <jackzxcui1989@....com>
Subject: [PATCH] workqueue: Support RT workqueues

On systems with strict real-time requirements, we have observed that
high-priority tasks, such as kernel threads that dispatch GPU jobs or
receive incoming data, often suffer latency spikes because the work
items they depend on are not executed with real-time priority.

Kworker threads are shared globally according to the attributes of the
workqueue (wq) and the parameters passed to queue_work_on(). Whether a
work item is queued on a newly created wq or an existing one, the
kworker that processes it is not dedicated to that work item or to
that wq. While this design saves resources, it makes it difficult to
guarantee real-time execution of a work item in hard real-time
scenarios by raising the priority of "its" kworker, because no such
dedicated kworker exists. Raising the kworker's real-time priority
while the work item runs and restoring it on completion does not help
either: by the time queue_work_on() is called again, the priority has
reverted, so timely execution still cannot be guaranteed. Moreover,
such frequent priority adjustments incur additional overhead.
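
For illustration, a minimal sketch of that per-work boosting
workaround, using the in-kernel sched_set_fifo()/sched_set_normal()
helpers; my_work_fn is a hypothetical handler and its body is elided:

  #include <linux/sched.h>
  #include <linux/workqueue.h>

  static void my_work_fn(struct work_struct *work)
  {
          sched_set_fifo(current);      /* boost this run's kworker */

          /* ... latency-sensitive part of the work ... */

          sched_set_normal(current, 0); /* restore before returning */
  }

The boost only covers this one execution; the next queue_work_on() may
land on a different kworker running at normal priority, which is
exactly the problem described above.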

We could instead implement all hard real-time logic with dedicated
kernel threads, but that effort seems unnecessary. The existing
workqueue mechanism is well structured and, through max_active and
WQ_ORDERED, already guarantees that work executes in an orderly manner
in concurrent scenarios. Introducing a WQ_RT flag plus a small amount
of code is enough to meet the requirements of hard real-time
workqueues: per-CPU worker pools whose kworkers run SCHED_FIFO, with
the priority configurable via the kworker_rt_prio parameter.
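
A minimal usage sketch under this patch (my_rt_wq and my_driver_init
are made-up names, reusing my_work_fn from the sketch above):

  #include <linux/errno.h>
  #include <linux/init.h>
  #include <linux/workqueue.h>

  static struct workqueue_struct *my_rt_wq;
  static DECLARE_WORK(my_work, my_work_fn);

  static int __init my_driver_init(void)
  {
          /* per-CPU wq served by SCHED_FIFO kworkers */
          my_rt_wq = alloc_workqueue("my_rt_wq", WQ_RT, 0);
          if (!my_rt_wq)
                  return -ENOMEM;

          /* queue as usual; no per-work priority juggling needed */
          queue_work(my_rt_wq, &my_work);
          return 0;
  }

Work that does not justify a dedicated wq can instead be queued on the
new system_rt_wq.
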
Signed-off-by: Xin Zhao <jackzxcui1989@....com>
---
 include/linux/workqueue.h |  6 ++++++
 kernel/workqueue.c        | 34 ++++++++++++++++++++++++----------
 2 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 45d5dd470..973876b79 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -16,6 +16,7 @@
#include <linux/cpumask_types.h>
#include <linux/rcupdate.h>
#include <linux/workqueue_types.h>
+#include <uapi/linux/sched/types.h>
/*
* The first word is the work queue pointer and the flags rolled into
@@ -404,6 +405,8 @@ enum wq_flags {
WQ_POWER_EFFICIENT = 1 << 7,
WQ_PERCPU = 1 << 8, /* bound to a specific cpu */
+ WQ_RT = 1 << 9,
+
__WQ_DESTROYING = 1 << 15, /* internal: workqueue is destroying */
__WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */
__WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */
@@ -460,6 +463,7 @@ enum wq_consts {
extern struct workqueue_struct *system_wq; /* use system_percpu_wq, this will be removed */
extern struct workqueue_struct *system_percpu_wq;
extern struct workqueue_struct *system_highpri_wq;
+extern struct workqueue_struct *system_rt_wq;
extern struct workqueue_struct *system_long_wq;
extern struct workqueue_struct *system_unbound_wq;
extern struct workqueue_struct *system_dfl_wq;
@@ -781,6 +785,8 @@ extern void __warn_flushing_systemwide_wq(void)
_wq == system_wq) || \
(__builtin_constant_p(_wq == system_highpri_wq) && \
_wq == system_highpri_wq) || \
+ (__builtin_constant_p(_wq == system_rt_wq) && \
+ _wq == system_rt_wq) || \
(__builtin_constant_p(_wq == system_long_wq) && \
_wq == system_long_wq) || \
(__builtin_constant_p(_wq == system_unbound_wq) && \
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c6b79b367..ccbf19e3a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -103,7 +103,7 @@ enum work_cancel_flags {
};
enum wq_internal_consts {
- NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */
+ NR_STD_WORKER_POOLS = 3, /* # standard pools per cpu */
UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */
BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
@@ -123,6 +123,7 @@ enum wq_internal_consts {
*/
RESCUER_NICE_LEVEL = MIN_NICE,
HIGHPRI_NICE_LEVEL = MIN_NICE,
+ RTPRI_LEVEL = MIN_NICE - 1,
WQ_NAME_LEN = 32,
WORKER_ID_LEN = 10 + WQ_NAME_LEN, /* "kworker/R-" + WQ_NAME_LEN */
@@ -509,6 +510,8 @@ struct workqueue_struct *system_percpu_wq __ro_after_init;
EXPORT_SYMBOL(system_percpu_wq);
struct workqueue_struct *system_highpri_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_highpri_wq);
+struct workqueue_struct *system_rt_wq __ro_after_init;
+EXPORT_SYMBOL_GPL(system_rt_wq);
struct workqueue_struct *system_long_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_long_wq);
struct workqueue_struct *system_unbound_wq __ro_after_init;
@@ -2751,7 +2754,8 @@ static int format_worker_id(char *buf, size_t size, struct worker *worker,
if (pool->cpu >= 0)
return scnprintf(buf, size, "kworker/%d:%d%s",
pool->cpu, worker->id,
- pool->attrs->nice < 0 ? "H" : "");
+ pool->attrs->nice < 0 ?
+ (pool->attrs->nice == RTPRI_LEVEL ? "F" : "H") : "");
else
return scnprintf(buf, size, "kworker/u%d:%d",
pool->id, worker->id);
@@ -2760,6 +2764,9 @@ static int format_worker_id(char *buf, size_t size, struct worker *worker,
}
}
+static int kworker_rt_prio = 1;
+module_param(kworker_rt_prio, int, 0444);
+
/**
* create_worker - create a new workqueue worker
* @pool: pool the new worker will belong to
@@ -2776,6 +2783,7 @@ static struct worker *create_worker(struct worker_pool *pool)
{
struct worker *worker;
int id;
+ struct sched_param sp;
/* ID is needed to determine kthread name */
id = ida_alloc(&pool->worker_ida, GFP_KERNEL);
@@ -2810,7 +2818,12 @@ static struct worker *create_worker(struct worker_pool *pool)
goto fail;
}
- set_user_nice(worker->task, pool->attrs->nice);
+ if (pool->attrs->nice == RTPRI_LEVEL) {
+ sp.sched_priority = kworker_rt_prio;
+ sched_setscheduler_nocheck(worker->task, SCHED_FIFO, &sp);
+ } else {
+ set_user_nice(worker->task, pool->attrs->nice);
+ }
kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
}
@@ -5470,7 +5483,7 @@ static void unbound_wq_update_pwq(struct workqueue_struct *wq, int cpu)
static int alloc_and_link_pwqs(struct workqueue_struct *wq)
{
- bool highpri = wq->flags & WQ_HIGHPRI;
+ int prio = (wq->flags & WQ_RT) ? 2 : (wq->flags & WQ_HIGHPRI ? 1 : 0);
int cpu, ret;
lockdep_assert_held(&wq_pool_mutex);
@@ -5491,7 +5504,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
struct pool_workqueue **pwq_p;
struct worker_pool *pool;
- pool = &(per_cpu_ptr(pools, cpu)[highpri]);
+ pool = &(per_cpu_ptr(pools, cpu)[prio]);
pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu);
*pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL,
@@ -5511,14 +5524,14 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
if (wq->flags & __WQ_ORDERED) {
struct pool_workqueue *dfl_pwq;
- ret = apply_workqueue_attrs_locked(wq, ordered_wq_attrs[highpri]);
+ ret = apply_workqueue_attrs_locked(wq, ordered_wq_attrs[prio]);
/* there should only be single pwq for ordering guarantee */
dfl_pwq = rcu_access_pointer(wq->dfl_pwq);
WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node ||
wq->pwqs.prev != &dfl_pwq->pwqs_node),
"ordering guarantee broken for workqueue %s\n", wq->name);
} else {
- ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[highpri]);
+ ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[prio]);
}
return ret;
@@ -7720,7 +7733,7 @@ static void __init init_cpu_worker_pool(struct worker_pool *pool, int cpu, int n
void __init workqueue_init_early(void)
{
struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM];
- int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
+ int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL, RTPRI_LEVEL };
void (*irq_work_fns[2])(struct irq_work *) = { bh_pool_kick_normal,
bh_pool_kick_highpri };
int i, cpu;
@@ -7805,6 +7818,7 @@ void __init workqueue_init_early(void)
system_wq = alloc_workqueue("events", 0, 0);
system_percpu_wq = alloc_workqueue("events", 0, 0);
system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
+ system_rt_wq = alloc_workqueue("events_rt", WQ_RT, 0);
system_long_wq = alloc_workqueue("events_long", 0, 0);
system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
@@ -7818,8 +7832,8 @@ void __init workqueue_init_early(void)
system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0);
system_bh_highpri_wq = alloc_workqueue("events_bh_highpri",
WQ_BH | WQ_HIGHPRI, 0);
- BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq ||
- !system_unbound_wq || !system_freezable_wq || !system_dfl_wq ||
+ BUG_ON(!system_wq || !system_percpu_wq || !system_highpri_wq || !system_rt_wq ||
+ !system_long_wq || !system_unbound_wq || !system_freezable_wq || !system_dfl_wq ||
!system_power_efficient_wq ||
!system_freezable_power_efficient_wq ||
!system_bh_wq || !system_bh_highpri_wq);
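
Note that because workqueue.c is built in and the parameter permission
is 0444, the SCHED_FIFO priority of the RT kworkers is chosen at boot
via the kernel command line, for example (an illustrative value; valid
SCHED_FIFO priorities are 1-99):

  workqueue.kworker_rt_prio=2
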
--
2.34.1