Message-Id: <7eb30a577e2c6a4f582515357aea25260105eb18.1748594841.git.libo.gcs85@bytedance.com>
Date: Fri, 30 May 2025 17:27:55 +0800
From: Bo Li <libo.gcs85@...edance.com>
To: tglx@...utronix.de,
mingo@...hat.com,
bp@...en8.de,
dave.hansen@...ux.intel.com,
x86@...nel.org,
luto@...nel.org,
kees@...nel.org,
akpm@...ux-foundation.org,
david@...hat.com,
juri.lelli@...hat.com,
vincent.guittot@...aro.org,
peterz@...radead.org
Cc: dietmar.eggemann@....com,
hpa@...or.com,
acme@...nel.org,
namhyung@...nel.org,
mark.rutland@....com,
alexander.shishkin@...ux.intel.com,
jolsa@...nel.org,
irogers@...gle.com,
adrian.hunter@...el.com,
kan.liang@...ux.intel.com,
viro@...iv.linux.org.uk,
brauner@...nel.org,
jack@...e.cz,
lorenzo.stoakes@...cle.com,
Liam.Howlett@...cle.com,
vbabka@...e.cz,
rppt@...nel.org,
surenb@...gle.com,
mhocko@...e.com,
rostedt@...dmis.org,
bsegall@...gle.com,
mgorman@...e.de,
vschneid@...hat.com,
jannh@...gle.com,
pfalcato@...e.de,
riel@...riel.com,
harry.yoo@...cle.com,
linux-kernel@...r.kernel.org,
linux-perf-users@...r.kernel.org,
linux-fsdevel@...r.kernel.org,
linux-mm@...ck.org,
duanxiongchun@...edance.com,
yinhongbo@...edance.com,
dengliang.1214@...edance.com,
xieyongji@...edance.com,
chaiwen.cc@...edance.com,
songmuchun@...edance.com,
yuanzhu@...edance.com,
chengguozhu@...edance.com,
sunjiadong.lff@...edance.com,
Bo Li <libo.gcs85@...edance.com>
Subject: [RFC v2 27/35] RPAL: add epoll support

To support the epoll family of interfaces, RPAL adds RPAL-service-specific
handling to the existing epoll code, so that user space can drive RPAL
services through the same interfaces it already uses.

When the receiver thread calls epoll_wait(), it can set RPAL_EP_POLL_MAGIC
in the shared call context to ask the kernel to take the RPAL path. The
kernel then sets the receiver's state to RPAL_RECEIVER_STATE_READY and
transitions it to RPAL_RECEIVER_STATE_WAIT once the receiver has actually
been removed from the runqueue, at which point a sender may perform RPAL
calls on the receiver thread.

Signed-off-by: Bo Li <libo.gcs85@...edance.com>
---
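Note for reviewers: the sketch below is not part of the patch; it only
illustrates how a receiver thread is expected to arm this path from user
space. The helper rpal_epoll_wait() and the partial struct mirror are
hypothetical; only the rcc fields and RPAL_EP_POLL_MAGIC shown here are
introduced by this series, and a real deployment would get the shared
context layout from an RPAL userspace header.

/*
 * Illustrative userspace sketch, not part of this patch. The struct below
 * mirrors only the rcc fields used here (hypothetical partial layout); in
 * practice the receiver obtains the shared rpal_receiver_call_context from
 * the RPAL runtime.
 */
#include <sys/epoll.h>

#define RPAL_EP_POLL_MAGIC 0xCC98CC98

struct rpal_receiver_call_context {
	int rpal_ep_poll_magic;
	int epfd;
	void *events;
	int maxevents;
	int timeout;
	/* ... remaining fields elided ... */
};

static int rpal_epoll_wait(struct rpal_receiver_call_context *rcc, int epfd,
			   struct epoll_event *events, int maxevents,
			   int timeout)
{
	/*
	 * Mirror the call arguments into the shared call context so that
	 * rpal_try_send_events() can harvest events directly from rcc.
	 */
	rcc->epfd = epfd;
	rcc->events = events;
	rcc->maxevents = maxevents;
	rcc->timeout = timeout;

	/* Ask do_epoll_wait() to take the rpal_ep_poll() path. */
	rcc->rpal_ep_poll_magic = RPAL_EP_POLL_MAGIC;

	/* A plain epoll_wait(); the kernel switches to the RPAL logic. */
	return epoll_wait(epfd, events, maxevents, timeout);
}

The kernel side reads events and maxevents from rcc rather than from the
syscall arguments in rpal_try_send_events(), which is why the sketch mirrors
them into the shared context before blocking.
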
 arch/x86/rpal/core.c |   4 +
 fs/eventpoll.c       | 200 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/rpal.h |  21 +++++
 kernel/sched/core.c  |  17 ++++
 4 files changed, 242 insertions(+)
diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index 47c9e551344e..6a22b9faa100 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -9,6 +9,7 @@
#include <linux/rpal.h>
#include <linux/sched/task_stack.h>
#include <linux/pkeys.h>
+#include <linux/file.h>
#include <asm/fsgsbase.h>
#include "internal.h"
@@ -63,6 +64,7 @@ void rpal_kernel_ret(struct pt_regs *regs)
if (rpal_test_current_thread_flag(RPAL_RECEIVER_BIT)) {
rcc = current->rpal_rd->rcc;
+ regs->ax = rpal_try_send_events(current->rpal_rd->ep, rcc);
atomic_xchg(&rcc->receiver_state, RPAL_RECEIVER_STATE_KERNEL_RET);
} else {
tsk = current->rpal_sd->receiver;
@@ -142,6 +144,7 @@ rpal_do_kernel_context_switch(struct task_struct *next, struct pt_regs *regs)
struct task_struct *prev = current;
if (rpal_test_task_thread_flag(next, RPAL_LAZY_SWITCHED_BIT)) {
+ rpal_resume_ep(next);
current->rpal_sd->receiver = next;
rpal_lock_cpu(current);
rpal_lock_cpu(next);
@@ -154,6 +157,7 @@ rpal_do_kernel_context_switch(struct task_struct *next, struct pt_regs *regs)
*/
rebuild_sender_stack(current->rpal_sd, regs);
rpal_schedule(next);
+ fdput(next->rpal_rd->f);
} else {
update_dst_stack(next, regs);
/*
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index d4dbffdedd08..437cd5764c03 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -38,6 +38,7 @@
#include <linux/compat.h>
#include <linux/rculist.h>
#include <linux/capability.h>
+#include <linux/rpal.h>
#include <net/busy_poll.h>
/*
@@ -2141,6 +2142,187 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
}
}
+#ifdef CONFIG_RPAL
+
+void rpal_resume_ep(struct task_struct *tsk)
+{
+ struct rpal_receiver_data *rrd = tsk->rpal_rd;
+ struct eventpoll *ep = (struct eventpoll *)rrd->ep;
+ struct rpal_receiver_call_context *rcc = rrd->rcc;
+
+ if (rcc->timeout > 0) {
+ hrtimer_cancel(&rrd->ep_sleeper.timer);
+ destroy_hrtimer_on_stack(&rrd->ep_sleeper.timer);
+ }
+ if (!list_empty_careful(&rrd->ep_wait.entry)) {
+ write_lock(&ep->lock);
+ __remove_wait_queue(&ep->wq, &rrd->ep_wait);
+ write_unlock(&ep->lock);
+ }
+}
+
+int rpal_try_send_events(void *ep, struct rpal_receiver_call_context *rcc)
+{
+ int eavail;
+ int res = 0;
+
+ res = ep_send_events(ep, rcc->events, rcc->maxevents);
+ if (res > 0)
+ ep_suspend_napi_irqs(ep);
+
+ eavail = ep_events_available(ep);
+ if (!eavail) {
+ atomic_and(~RPAL_KERNEL_PENDING, &rcc->ep_pending);
+ /* check again to avoid data race on RPAL_KERNEL_PENDING */
+ eavail = ep_events_available(ep);
+ if (eavail)
+ atomic_or(RPAL_KERNEL_PENDING, &rcc->ep_pending);
+ }
+ return res;
+}
+
+static int rpal_schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
+ const enum hrtimer_mode mode,
+ clockid_t clock_id)
+{
+ struct hrtimer_sleeper *t = &current->rpal_rd->ep_sleeper;
+
+ /*
+ * Optimize when a zero timeout value is given. It does not
+ * matter whether this is an absolute or a relative time.
+ */
+ if (expires && *expires == 0) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ /*
+ * A NULL parameter means "infinite"
+ */
+ if (!expires) {
+ schedule();
+ return -EINTR;
+ }
+
+ hrtimer_setup_sleeper_on_stack(t, clock_id, mode);
+ hrtimer_set_expires_range_ns(&t->timer, *expires, delta);
+ hrtimer_sleeper_start_expires(t, mode);
+
+ if (likely(t->task))
+ schedule();
+
+ hrtimer_cancel(&t->timer);
+ destroy_hrtimer_on_stack(&t->timer);
+
+ __set_current_state(TASK_RUNNING);
+
+ return !t->task ? 0 : -EINTR;
+}
+
+static int rpal_ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
+ int maxevents, struct timespec64 *timeout)
+{
+ int res = 0, eavail, timed_out = 0;
+ u64 slack = 0;
+ struct rpal_receiver_data *rrd = current->rpal_rd;
+ wait_queue_entry_t *wait = &rrd->ep_wait;
+ ktime_t expires, *to = NULL;
+
+ rrd->ep = ep;
+
+ lockdep_assert_irqs_enabled();
+
+ if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
+ slack = select_estimate_accuracy(timeout);
+ to = &expires;
+ *to = timespec64_to_ktime(*timeout);
+ } else if (timeout) {
+ timed_out = 1;
+ }
+
+ eavail = ep_events_available(ep);
+
+ while (1) {
+ if (eavail) {
+ res = rpal_try_send_events(ep, rrd->rcc);
+ if (res) {
+ atomic_xchg(&rrd->rcc->receiver_state,
+ RPAL_RECEIVER_STATE_RUNNING);
+ return res;
+ }
+ }
+
+ if (timed_out) {
+ atomic_xchg(&rrd->rcc->receiver_state,
+ RPAL_RECEIVER_STATE_RUNNING);
+ return 0;
+ }
+
+ eavail = ep_busy_loop(ep);
+ if (eavail)
+ continue;
+
+ if (signal_pending(current)) {
+ atomic_xchg(&rrd->rcc->receiver_state,
+ RPAL_RECEIVER_STATE_RUNNING);
+ return -EINTR;
+ }
+
+ init_wait(wait);
+ wait->func = rpal_ep_autoremove_wake_function;
+ wait->private = rrd;
+ write_lock_irq(&ep->lock);
+
+ atomic_xchg(&rrd->rcc->receiver_state,
+ RPAL_RECEIVER_STATE_READY);
+ __set_current_state(TASK_INTERRUPTIBLE);
+
+ eavail = ep_events_available(ep);
+ if (!eavail)
+ __add_wait_queue_exclusive(&ep->wq, wait);
+
+ write_unlock_irq(&ep->lock);
+
+ if (!eavail && ep_schedule_timeout(to)) {
+ if (RPAL_USER_PENDING & atomic_read(&rrd->rcc->ep_pending)) {
+ timed_out = 1;
+ } else {
+ timed_out =
+ !rpal_schedule_hrtimeout_range_clock(
+ to, slack, HRTIMER_MODE_ABS,
+ CLOCK_MONOTONIC);
+ }
+ }
+ atomic_cmpxchg(&rrd->rcc->receiver_state,
+ RPAL_RECEIVER_STATE_READY,
+ RPAL_RECEIVER_STATE_RUNNING);
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * We were woken up, thus go and try to harvest some events.
+ * If timed out and still on the wait queue, recheck eavail
+ * carefully under lock, below.
+ */
+ eavail = 1;
+
+ if (!list_empty_careful(&wait->entry)) {
+ write_lock_irq(&ep->lock);
+ /*
+ * If the thread timed out and is not on the wait queue,
+ * it means that the thread was woken up after its
+ * timeout expired before it could reacquire the lock.
+ * Thus, when wait.entry is empty, it needs to harvest
+ * events.
+ */
+ if (timed_out)
+ eavail = list_empty(&wait->entry);
+ __remove_wait_queue(&ep->wq, wait);
+ write_unlock_irq(&ep->lock);
+ }
+ }
+}
+#endif
+
/**
* ep_loop_check_proc - verify that adding an epoll file inside another
* epoll structure does not violate the constraints, in
@@ -2529,7 +2711,25 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
ep = fd_file(f)->private_data;
/* Time to fish for events ... */
+#ifdef CONFIG_RPAL
+ /*
+ * For an RPAL task, if it is a receiver and it has set RPAL_EP_POLL_MAGIC
+ * in shared memory, we assume it is prepared for RPAL calls and handle it
+ * differently.
+ *
+ * In all other cases, an RPAL task behaves like a normal task.
+ */
+ if (rpal_current_service() &&
+ rpal_test_current_thread_flag(RPAL_RECEIVER_BIT) &&
+ current->rpal_rd->rcc->rpal_ep_poll_magic == RPAL_EP_POLL_MAGIC) {
+ current->rpal_rd->f = f;
+ return rpal_ep_poll(ep, events, maxevents, to);
+ } else {
+ return ep_poll(ep, events, maxevents, to);
+ }
+#else
return ep_poll(ep, events, maxevents, to);
+#endif
}
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index f2474cb53abe..5912ffec6e28 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -16,6 +16,8 @@
#include <linux/hashtable.h>
#include <linux/atomic.h>
#include <linux/sizes.h>
+#include <linux/file.h>
+#include <linux/hrtimer.h>
#define RPAL_ERROR_MSG "rpal error: "
#define rpal_err(x...) pr_err(RPAL_ERROR_MSG x)
@@ -89,6 +91,7 @@ enum {
};
#define RPAL_ERROR_MAGIC 0x98CC98CC
+#define RPAL_EP_POLL_MAGIC 0xCC98CC98
#define RPAL_SID_SHIFT 24
#define RPAL_ID_SHIFT 8
@@ -103,6 +106,9 @@ enum {
#define RPAL_PKRU_UNION 1
#define RPAL_PKRU_INTERSECT 2
+#define RPAL_KERNEL_PENDING 0x1
+#define RPAL_USER_PENDING 0x2
+
extern unsigned long rpal_cap;
enum rpal_task_flag_bits {
@@ -282,6 +288,12 @@ struct rpal_receiver_call_context {
int receiver_id;
atomic_t receiver_state;
atomic_t sender_state;
+ atomic_t ep_pending;
+ int rpal_ep_poll_magic;
+ int epfd;
+ void __user *events;
+ int maxevents;
+ int timeout;
};
/* recovery point for sender */
@@ -325,6 +337,10 @@ struct rpal_receiver_data {
struct rpal_shared_page *rsp;
struct rpal_receiver_call_context *rcc;
struct task_struct *sender;
+ void *ep;
+ struct fd f;
+ struct hrtimer_sleeper ep_sleeper;
+ wait_queue_entry_t ep_wait;
};
struct rpal_sender_data {
@@ -574,4 +590,9 @@ __rpal_switch_to(struct task_struct *prev_p, struct task_struct *next_p);
asmlinkage __visible void rpal_schedule_tail(struct task_struct *prev);
int do_rpal_mprotect_pkey(unsigned long start, size_t len, int pkey);
void rpal_set_pku_schedule_tail(struct task_struct *prev);
+int rpal_ep_autoremove_wake_function(wait_queue_entry_t *curr,
+ unsigned int mode, int wake_flags,
+ void *key);
+void rpal_resume_ep(struct task_struct *tsk);
+int rpal_try_send_events(void *ep, struct rpal_receiver_call_context *rcc);
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index eb5d5bd51597..486d59bdd3fc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6794,6 +6794,23 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
#define SM_RTLOCK_WAIT 2
#ifdef CONFIG_RPAL
+int rpal_ep_autoremove_wake_function(wait_queue_entry_t *curr,
+ unsigned int mode, int wake_flags,
+ void *key)
+{
+ struct rpal_receiver_data *rrd = curr->private;
+ struct task_struct *tsk = rrd->rcd.bp_task;
+ int ret;
+
+ ret = try_to_wake_up(tsk, mode, wake_flags);
+
+ list_del_init_careful(&curr->entry);
+ if (!ret)
+ atomic_or(RPAL_KERNEL_PENDING, &rrd->rcc->ep_pending);
+
+ return 1;
+}
+
static inline void rpal_check_ready_state(struct task_struct *tsk, int state)
{
if (rpal_test_task_thread_flag(tsk, RPAL_RECEIVER_BIT)) {
--
2.20.1