Message-Id: <8ff6cea94a6438a0856c86a11d56be462314b1f8.1748594841.git.libo.gcs85@bytedance.com>
Date: Fri, 30 May 2025 17:27:44 +0800
From: Bo Li <libo.gcs85@...edance.com>
To: tglx@...utronix.de,
	mingo@...hat.com,
	bp@...en8.de,
	dave.hansen@...ux.intel.com,
	x86@...nel.org,
	luto@...nel.org,
	kees@...nel.org,
	akpm@...ux-foundation.org,
	david@...hat.com,
	juri.lelli@...hat.com,
	vincent.guittot@...aro.org,
	peterz@...radead.org
Cc: dietmar.eggemann@....com,
	hpa@...or.com,
	acme@...nel.org,
	namhyung@...nel.org,
	mark.rutland@....com,
	alexander.shishkin@...ux.intel.com,
	jolsa@...nel.org,
	irogers@...gle.com,
	adrian.hunter@...el.com,
	kan.liang@...ux.intel.com,
	viro@...iv.linux.org.uk,
	brauner@...nel.org,
	jack@...e.cz,
	lorenzo.stoakes@...cle.com,
	Liam.Howlett@...cle.com,
	vbabka@...e.cz,
	rppt@...nel.org,
	surenb@...gle.com,
	mhocko@...e.com,
	rostedt@...dmis.org,
	bsegall@...gle.com,
	mgorman@...e.de,
	vschneid@...hat.com,
	jannh@...gle.com,
	pfalcato@...e.de,
	riel@...riel.com,
	harry.yoo@...cle.com,
	linux-kernel@...r.kernel.org,
	linux-perf-users@...r.kernel.org,
	linux-fsdevel@...r.kernel.org,
	linux-mm@...ck.org,
	duanxiongchun@...edance.com,
	yinhongbo@...edance.com,
	dengliang.1214@...edance.com,
	xieyongji@...edance.com,
	chaiwen.cc@...edance.com,
	songmuchun@...edance.com,
	yuanzhu@...edance.com,
	chengguozhu@...edance.com,
	sunjiadong.lff@...edance.com,
	Bo Li <libo.gcs85@...edance.com>
Subject: [RFC v2 16/35] RPAL: add cpu lock interface

Lazy switch lets the kernel switch from one task to another so that the
kernel context and the user context stay matched. From the scheduler's
point of view, both tasks involved in such a context switch must reside
on the same run queue (rq). Therefore, before a lazy switch occurs, the
kernel must first bind both tasks to the same CPU.
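
For illustration only (this helper is not part of the patch and its name
is made up), the invariant the scheduler relies on can be sketched as:

	/*
	 * Hypothetical check: a lazy switch from @prev to @next is only
	 * legal once both tasks sit on the current CPU and therefore
	 * share one run queue.
	 */
	static inline bool rpal_lazy_switch_possible(struct task_struct *prev,
						     struct task_struct *next)
	{
		return task_cpu(prev) == smp_processor_id() &&
		       task_cpu(next) == smp_processor_id();
	}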

This patch introduces the rpal_lock_cpu() interface, which pins a task to
the current CPU while bypassing its cpumask, so that both tasks involved
in a lazy switch end up on the same CPU. rpal_unlock_cpu() is the inverse
operation: it releases the binding by restoring the saved cpumask. To keep
this consistent, the kernel must prevent other threads from modifying the
CPU affinity of a task locked by rpal_lock_cpu(). A set_cpus_allowed_ptr()
call on such a task therefore waits until the binding established by
rpal_lock_cpu() is released before the affinity change proceeds. Because
rpal_set_cpus_allowed_ptr() needs a struct set_affinity_pending for the
migration stopper work, each sender and receiver pre-allocates one at
registration time (rpal_init_thread_pending()) and frees it when
unregistering.
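
As a rough sketch of the intended pairing (the call sites are not part of
this patch and may look different in practice), a lazy-switch path is
expected to bracket the switch with the two calls, with interrupts already
disabled as the warning in rpal_lock_cpu() assumes:

	/* hypothetical caller, for illustration only */
	rpal_lock_cpu(next);	/* pin @next to this CPU, saving its cpus_mask */
	/* ... perform the lazy context switch to @next ... */
	rpal_unlock_cpu(next);	/* restore the saved cpus_mask */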

Signed-off-by: Bo Li <libo.gcs85@...edance.com>
---
 arch/x86/rpal/core.c   |  18 +++++++
 arch/x86/rpal/thread.c |  14 ++++++
 include/linux/rpal.h   |   8 +++
 kernel/sched/core.c    | 109 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 149 insertions(+)

diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index 61f5d40b0157..c185a453c1b2 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -15,6 +15,24 @@ int __init rpal_init(void);
 bool rpal_inited;
 unsigned long rpal_cap;
 
+static inline void rpal_lock_cpu(struct task_struct *tsk)
+{
+	rpal_set_cpus_allowed_ptr(tsk, true);
+	if (unlikely(!irqs_disabled())) {
+		local_irq_disable();
+		rpal_err("%s: irq is enabled\n", __func__);
+	}
+}
+
+static inline void rpal_unlock_cpu(struct task_struct *tsk)
+{
+	rpal_set_cpus_allowed_ptr(tsk, false);
+	if (unlikely(!irqs_disabled())) {
+		local_irq_disable();
+		rpal_err("%s: irq is enabled\n", __func__);
+	}
+}
+
 int __init rpal_init(void)
 {
 	int ret = 0;
diff --git a/arch/x86/rpal/thread.c b/arch/x86/rpal/thread.c
index e50a4c865ff8..bc203e9c6e5e 100644
--- a/arch/x86/rpal/thread.c
+++ b/arch/x86/rpal/thread.c
@@ -47,6 +47,10 @@ int rpal_register_sender(unsigned long addr)
 	}
 
 	rpal_common_data_init(&rsd->rcd);
+	if (rpal_init_thread_pending(&rsd->rcd)) {
+		ret = -ENOMEM;
+		goto free_rsd;
+	}
 	rsd->rsp = rsp;
 	rsd->scc = (struct rpal_sender_call_context *)(addr - rsp->user_start +
 						       rsp->kernel_start);
@@ -58,6 +62,8 @@ int rpal_register_sender(unsigned long addr)
 
 	return 0;
 
+free_rsd:
+	kfree(rsd);
 put_shared_page:
 	rpal_put_shared_page(rsp);
 out:
@@ -77,6 +83,7 @@ int rpal_unregister_sender(void)
 
 	rpal_put_shared_page(rsd->rsp);
 	rpal_clear_current_thread_flag(RPAL_SENDER_BIT);
+	rpal_free_thread_pending(&rsd->rcd);
 	kfree(rsd);
 
 	atomic_dec(&cur->thread_cnt);
@@ -116,6 +123,10 @@ int rpal_register_receiver(unsigned long addr)
 	}
 
 	rpal_common_data_init(&rrd->rcd);
+	if (rpal_init_thread_pending(&rrd->rcd)) {
+		ret = -ENOMEM;
+		goto free_rrd;
+	}
 	rrd->rsp = rsp;
 	rrd->rcc =
 		(struct rpal_receiver_call_context *)(addr - rsp->user_start +
@@ -128,6 +139,8 @@ int rpal_register_receiver(unsigned long addr)
 
 	return 0;
 
+free_rrd:
+	kfree(rrd);
 put_shared_page:
 	rpal_put_shared_page(rsp);
 out:
@@ -147,6 +160,7 @@ int rpal_unregister_receiver(void)
 
 	rpal_put_shared_page(rrd->rsp);
 	rpal_clear_current_thread_flag(RPAL_RECEIVER_BIT);
+	rpal_free_thread_pending(&rrd->rcd);
 	kfree(rrd);
 
 	atomic_dec(&cur->thread_cnt);
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index 4f4719bb7eae..5b115be14a55 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -99,6 +99,7 @@ extern unsigned long rpal_cap;
 enum rpal_task_flag_bits {
 	RPAL_SENDER_BIT,
 	RPAL_RECEIVER_BIT,
+	RPAL_CPU_LOCKED_BIT,
 };
 
 enum rpal_receiver_state {
@@ -270,8 +271,12 @@ struct rpal_shared_page {
 struct rpal_common_data {
 	/* back pointer to task_struct */
 	struct task_struct *bp_task;
+	/* pending struct for cpu locking */
+	void *pending;
 	/* service id of rpal_service */
 	int service_id;
+	/* cpumask before locked */
+	cpumask_t old_mask;
 };
 
 struct rpal_receiver_data {
@@ -464,4 +469,7 @@ struct mm_struct *rpal_pf_get_real_mm(unsigned long address, int *rebuild);
 extern void rpal_pick_mmap_base(struct mm_struct *mm,
 	struct rlimit *rlim_stack);
 int rpal_try_to_wake_up(struct task_struct *p);
+int rpal_init_thread_pending(struct rpal_common_data *rcd);
+void rpal_free_thread_pending(struct rpal_common_data *rcd);
+int rpal_set_cpus_allowed_ptr(struct task_struct *p, bool is_lock);
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 045e92ee2e3b..a862bf4a0161 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3155,6 +3155,104 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
 	return ret;
 }
 
+#ifdef CONFIG_RPAL
+int rpal_init_thread_pending(struct rpal_common_data *rcd)
+{
+	struct set_affinity_pending *pending;
+
+	pending = kzalloc(sizeof(*pending), GFP_KERNEL);
+	if (!pending)
+		return -ENOMEM;
+	pending->stop_pending = 0;
+	pending->arg = (struct migration_arg){
+		.task = current,
+		.pending = NULL,
+	};
+	rcd->pending = pending;
+	return 0;
+}
+
+void rpal_free_thread_pending(struct rpal_common_data *rcd)
+{
+	if (rcd->pending != NULL)
+		kfree(rcd->pending);
+}
+
+/*
+ * The CPU lock is forced: the task's cpumask is temporarily ignored by RPAL.
+ */
+int rpal_set_cpus_allowed_ptr(struct task_struct *p, bool is_lock)
+{
+	const struct cpumask *cpu_valid_mask = cpu_active_mask;
+	struct set_affinity_pending *pending = p->rpal_cd->pending;
+	struct cpumask mask;
+	unsigned int dest_cpu;
+	struct rq_flags rf;
+	struct rq *rq;
+	int ret = 0;
+	struct affinity_context ac = {
+		.new_mask = &mask,
+		.flags = 0,
+	};
+
+	if (unlikely(p->flags & PF_KTHREAD))
+		rpal_err("p: %d, p->flags & PF_KTHREAD\n", p->pid);
+
+	rq = task_rq_lock(p, &rf);
+
+	if (is_lock) {
+		cpumask_copy(&p->rpal_cd->old_mask, &p->cpus_mask);
+		cpumask_clear(&mask);
+		cpumask_set_cpu(smp_processor_id(), &mask);
+		rpal_set_task_thread_flag(p, RPAL_CPU_LOCKED_BIT);
+	} else {
+		cpumask_copy(&mask, &p->rpal_cd->old_mask);
+		rpal_clear_task_thread_flag(p, RPAL_CPU_LOCKED_BIT);
+	}
+
+	update_rq_clock(rq);
+
+	if (cpumask_equal(&p->cpus_mask, ac.new_mask))
+		goto out;
+	/*
+	 * Picking a ~random cpu helps in cases where we are changing affinity
+	 * for groups of tasks (ie. cpuset), so that load balancing is not
+	 * immediately required to distribute the tasks within their new mask.
+	 */
+	dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, ac.new_mask);
+	if (dest_cpu >= nr_cpu_ids) {
+		ret = -EINVAL;
+		goto out;
+	}
+	__do_set_cpus_allowed(p, &ac);
+	if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+		preempt_disable();
+		task_rq_unlock(rq, p, &rf);
+		preempt_enable();
+	} else {
+		pending->arg.dest_cpu = dest_cpu;
+
+		if (task_on_cpu(rq, p) ||
+		    READ_ONCE(p->__state) == TASK_WAKING) {
+			preempt_disable();
+			task_rq_unlock(rq, p, &rf);
+			stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+					    &pending->arg, &pending->stop_work);
+		} else {
+			if (task_on_rq_queued(p))
+				rq = move_queued_task(rq, &rf, p, dest_cpu);
+			task_rq_unlock(rq, p, &rf);
+		}
+	}
+
+	return 0;
+
+out:
+	task_rq_unlock(rq, p, &rf);
+	return ret;
+}
+#endif
+
 /*
  * Change a given task's CPU affinity. Migrate the thread to a
  * proper CPU and schedule it away if the CPU it's executing on
@@ -3169,7 +3267,18 @@ int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx)
 	struct rq_flags rf;
 	struct rq *rq;
 
+#ifdef CONFIG_RPAL
+retry:
+	rq = task_rq_lock(p, &rf);
+	if (rpal_test_task_thread_flag(p, RPAL_CPU_LOCKED_BIT)) {
+		update_rq_clock(rq);
+		task_rq_unlock(rq, p, &rf);
+		schedule();
+		goto retry;
+	}
+#else
 	rq = task_rq_lock(p, &rf);
+#endif
 	/*
 	 * Masking should be skipped if SCA_USER or any of the SCA_MIGRATE_*
 	 * flags are set.
-- 
2.20.1

