Message-Id: <b13520ef51366f6c25c50f05de7210d37fcd9489.1748594841.git.libo.gcs85@bytedance.com>
Date: Fri, 30 May 2025 17:28:02 +0800
From: Bo Li <libo.gcs85@...edance.com>
To: tglx@...utronix.de,
	mingo@...hat.com,
	bp@...en8.de,
	dave.hansen@...ux.intel.com,
	x86@...nel.org,
	luto@...nel.org,
	kees@...nel.org,
	akpm@...ux-foundation.org,
	david@...hat.com,
	juri.lelli@...hat.com,
	vincent.guittot@...aro.org,
	peterz@...radead.org
Cc: dietmar.eggemann@....com,
	hpa@...or.com,
	acme@...nel.org,
	namhyung@...nel.org,
	mark.rutland@....com,
	alexander.shishkin@...ux.intel.com,
	jolsa@...nel.org,
	irogers@...gle.com,
	adrian.hunter@...el.com,
	kan.liang@...ux.intel.com,
	viro@...iv.linux.org.uk,
	brauner@...nel.org,
	jack@...e.cz,
	lorenzo.stoakes@...cle.com,
	Liam.Howlett@...cle.com,
	vbabka@...e.cz,
	rppt@...nel.org,
	surenb@...gle.com,
	mhocko@...e.com,
	rostedt@...dmis.org,
	bsegall@...gle.com,
	mgorman@...e.de,
	vschneid@...hat.com,
	jannh@...gle.com,
	pfalcato@...e.de,
	riel@...riel.com,
	harry.yoo@...cle.com,
	linux-kernel@...r.kernel.org,
	linux-perf-users@...r.kernel.org,
	linux-fsdevel@...r.kernel.org,
	linux-mm@...ck.org,
	duanxiongchun@...edance.com,
	yinhongbo@...edance.com,
	dengliang.1214@...edance.com,
	xieyongji@...edance.com,
	chaiwen.cc@...edance.com,
	songmuchun@...edance.com,
	yuanzhu@...edance.com,
	chengguozhu@...edance.com,
	sunjiadong.lff@...edance.com,
	Bo Li <libo.gcs85@...edance.com>
Subject: [RFC v2 34/35] RPAL: enable fast epoll wait

When a kernel event occurs during an RPAL call and triggers a lazy switch,
the kernel context switches from the sender to the receiver. When the
receiver later returns from user space to the sender, a second lazy switch
is required to switch the kernel context back to the sender. In the current
implementation, after the second lazy switch, the receiver returns to user
space via rpal_kernel_ret() and then calls epoll_wait() from user space to
re-enter the kernel. As a result, the receiver cannot process epoll events
for an extended period, which degrades performance.
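
For reference, the user-space receiver loop that pays for this round trip
looks roughly like the following (a minimal sketch; the fd names and the
rpal_handle_events() helper are hypothetical, not part of this series):

	for (;;) {
		/* After the second lazy switch, the receiver first returns
		 * here and only then re-enters the kernel via epoll_wait(). */
		int n = epoll_wait(epfd, events, maxevents, timeout_ms);
		if (n < 0 && errno != EINTR)
			break;
		rpal_handle_events(events, n);	/* hypothetical handler */
	}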

This patch introduces a fast epoll wait feature. During the second lazy
switch, the kernel configures epoll-related data structures so that the
receiver can directly enter the epoll wait state without first returning
to user space and then calling epoll_wait(). The patch adds a new state
RPAL_RECEIVER_STATE_READY_LS, which is used to mark that the receiver can
transition to RPAL_RECEIVER_STATE_WAIT during the second lazy switch. The
kernel then performs this state transition in rpal_lazy_switch_tail().
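
Conceptually, the transition performed during the second lazy switch reduces
to the following (a simplified restatement of the rpal_lazy_switch_tail()
change below, not additional code):

	/* Promote the state only if it was set by this lazy switch; a
	 * READY state set by a concurrent epoll_wait() is left untouched. */
	if (atomic_read(&rcc->receiver_state) == RPAL_RECEIVER_STATE_READY_LS)
		atomic_cmpxchg(&rcc->receiver_state,
			       RPAL_RECEIVER_STATE_READY_LS,
			       RPAL_RECEIVER_STATE_WAIT);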

Signed-off-by: Bo Li <libo.gcs85@...edance.com>
---
 arch/x86/rpal/core.c |  29 ++++++++++++-
 fs/eventpoll.c       | 101 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/rpal.h |   3 ++
 kernel/sched/core.c  |  13 +++++-
 4 files changed, 143 insertions(+), 3 deletions(-)

diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index 2ac5d932f69c..7b6efde23e48 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -51,7 +51,25 @@ void rpal_lazy_switch_tail(struct task_struct *tsk)
 		atomic_cmpxchg(&rcc->receiver_state, rpal_build_call_state(tsk->rpal_sd),
 			       RPAL_RECEIVER_STATE_LAZY_SWITCH);
 	} else {
+		/* tsk is receiver */
+		int state;
+
+		rcc = tsk->rpal_rd->rcc;
+		state = atomic_read(&rcc->receiver_state);
+		/* The receiver may be scheduled on another CPU after unlock. */
 		rpal_unlock_cpu(tsk);
+		/*
+		 * We must not use RPAL_RECEIVER_STATE_READY instead of
+		 * RPAL_RECEIVER_STATE_READY_LS. The receiver may be in the
+		 * TASK_RUNNING state and call epoll_wait() again, in which
+		 * case the state may already be RPAL_RECEIVER_STATE_READY;
+		 * we must not change that state to RPAL_RECEIVER_STATE_WAIT,
+		 * since it was set by another RPAL call.
+		 */
+		if (state == RPAL_RECEIVER_STATE_READY_LS)
+			atomic_cmpxchg(&rcc->receiver_state,
+				       RPAL_RECEIVER_STATE_READY_LS,
+				       RPAL_RECEIVER_STATE_WAIT);
 		rpal_unlock_cpu(current);
 	}
 }
@@ -63,8 +81,14 @@ void rpal_kernel_ret(struct pt_regs *regs)
 	int state;
 
 	if (rpal_test_current_thread_flag(RPAL_RECEIVER_BIT)) {
-		rcc = current->rpal_rd->rcc;
-		regs->ax = rpal_try_send_events(current->rpal_rd->ep, rcc);
+		struct rpal_receiver_data *rrd = current->rpal_rd;
+
+		rcc = rrd->rcc;
+		if (rcc->timeout > 0)
+			hrtimer_cancel(&rrd->ep_sleeper.timer);
+		rpal_remove_ep_wait_list(rrd);
+		regs->ax = rpal_try_send_events(rrd->ep, rcc);
+		fdput(rrd->f);
 		atomic_xchg(&rcc->receiver_state, RPAL_RECEIVER_STATE_KERNEL_RET);
 	} else {
 		tsk = current->rpal_sd->receiver;
@@ -173,6 +197,7 @@ rpal_do_kernel_context_switch(struct task_struct *next, struct pt_regs *regs)
 		 * Otherwise, sender's user context will be corrupted.
 		 */
 		rebuild_receiver_stack(current->rpal_rd, regs);
+		rpal_fast_ep_poll(current->rpal_rd, regs);
 		rpal_schedule(next);
 		rpal_clear_task_thread_flag(prev, RPAL_LAZY_SWITCHED_BIT);
 		prev->rpal_rd->sender = NULL;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 791321639561..b70c1cd82335 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2143,6 +2143,107 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 }
 
 #ifdef CONFIG_RPAL
+static void *rpal_get_eventpoll(struct rpal_receiver_data *rrd, struct pt_regs *regs)
+{
+	struct rpal_receiver_call_context *rcc = rrd->rcc;
+	int epfd = rcc->epfd;
+	struct epoll_event __user *events = rcc->events;
+	int maxevents = rcc->maxevents;
+	struct file *file;
+
+	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) {
+		regs->ax = -EINVAL;
+		return NULL;
+	}
+
+	if (!access_ok(events, maxevents * sizeof(struct epoll_event))) {
+		regs->ax = -EFAULT;
+		return NULL;
+	}
+
+	rrd->f = fdget(epfd);
+	file = fd_file(rrd->f);
+	if (!file) {
+		regs->ax = -EBADF;
+		return NULL;
+	}
+
+	if (!is_file_epoll(file)) {
+		regs->ax = -EINVAL;
+		fdput(rrd->f);
+		return NULL;
+	}
+
+	rrd->ep = file->private_data;
+	return rrd->ep;
+}
+
+void rpal_fast_ep_poll(struct rpal_receiver_data *rrd, struct pt_regs *regs)
+{
+	struct eventpoll *ep;
+	struct rpal_receiver_call_context *rcc = rrd->rcc;
+	ktime_t ts = 0;
+	struct hrtimer *ht = &rrd->ep_sleeper.timer;
+	int state;
+	int avail;
+
+	regs->orig_ax = __NR_epoll_wait;
+	ep = rpal_get_eventpoll(rrd, regs);
+
+	if (!ep || signal_pending(current) ||
+	    unlikely(ep_events_available(ep)) ||
+	    atomic_read(&rcc->ep_pending) || unlikely(rcc->timeout == 0)) {
+		INIT_LIST_HEAD(&rrd->ep_wait.entry);
+	} else {
+		/*
+		 * Here we use RPAL_RECEIVER_STATE_READY_LS to avoid conflict
+		 * with RPAL_RECEIVER_STATE_READY. Since RPAL_RECEIVER_STATE_READY_LS
+		 * is converted to RPAL_RECEIVER_STATE_WAIT in rpal_lazy_switch_tail(),
+		 * the receiver may be woken at that point. Thus,
+		 * rpal_lazy_switch_tail() must figure out whether the receiver
+		 * state was set by the lazy switch or not. See
+		 * rpal_lazy_switch_tail() for details.
+		 */
+		state = atomic_xchg(&rcc->receiver_state, RPAL_RECEIVER_STATE_READY_LS);
+		if (unlikely(state != RPAL_RECEIVER_STATE_LAZY_SWITCH))
+			rpal_err("%s: unexpected state: %d\n", __func__, state);
+		init_waitqueue_func_entry(&rrd->ep_wait, rpal_ep_autoremove_wake_function);
+		rrd->ep_wait.private = rrd;
+		INIT_LIST_HEAD(&rrd->ep_wait.entry);
+		write_lock(&ep->lock);
+		set_current_state(TASK_INTERRUPTIBLE);
+		avail = ep_events_available(ep);
+		if (!avail)
+			__add_wait_queue_exclusive(&ep->wq, &rrd->ep_wait);
+		write_unlock(&ep->lock);
+		if (avail) {
+			/* keep state consistent when we enter rpal_kernel_ret() */
+			atomic_set(&rcc->receiver_state,
+				   RPAL_RECEIVER_STATE_LAZY_SWITCH);
+			set_current_state(TASK_RUNNING);
+			return;
+		}
+
+		if (rcc->timeout > 0) {
+			rrd->ep_sleeper.task = rrd->rcd.bp_task;
+			ts = ms_to_ktime(rcc->timeout);
+			hrtimer_start(ht, ts, HRTIMER_MODE_REL);
+		}
+	}
+}
+
+void rpal_remove_ep_wait_list(struct rpal_receiver_data *rrd)
+{
+	struct eventpoll *ep = (struct eventpoll *)rrd->ep;
+	wait_queue_entry_t *wait = &rrd->ep_wait;
+
+	if (!list_empty_careful(&wait->entry)) {
+		write_lock_irq(&ep->lock);
+		__remove_wait_queue(&ep->wq, wait);
+		write_unlock_irq(&ep->lock);
+	}
+}
+
 void *rpal_get_epitemep(wait_queue_entry_t *wait)
 {
 	struct epitem *epi = ep_item_from_wait(wait);
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index f5f4da63f28c..676113f0ba1f 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -126,6 +126,7 @@ enum rpal_receiver_state {
 	RPAL_RECEIVER_STATE_WAIT,
 	RPAL_RECEIVER_STATE_CALL,
 	RPAL_RECEIVER_STATE_LAZY_SWITCH,
+	RPAL_RECEIVER_STATE_READY_LS,
 	RPAL_RECEIVER_STATE_MAX,
 };
 
@@ -627,4 +628,6 @@ void rpal_resume_ep(struct task_struct *tsk);
 void *rpal_get_epitemep(wait_queue_entry_t *wait);
 int rpal_get_epitemfd(wait_queue_entry_t *wait);
 int rpal_try_send_events(void *ep, struct rpal_receiver_call_context *rcc);
+void rpal_remove_ep_wait_list(struct rpal_receiver_data *rrd);
+void rpal_fast_ep_poll(struct rpal_receiver_data *rrd, struct pt_regs *regs);
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d6f8e0d76fc0..1728b04d1387 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3965,6 +3965,11 @@ static bool rpal_check_state(struct task_struct *p)
 		case RPAL_RECEIVER_STATE_LAZY_SWITCH:
 		case RPAL_RECEIVER_STATE_RUNNING:
 			break;
+		/*
+		 * Allowing RPAL_RECEIVER_STATE_READY_LS to be woken here would
+		 * cause IRQs to be enabled in rpal_unlock_cpu().
+		 */
+		case RPAL_RECEIVER_STATE_READY_LS:
 		case RPAL_RECEIVER_STATE_CALL:
 			rpal_set_task_thread_flag(p, RPAL_WAKE_BIT);
 			ret = false;
@@ -11403,7 +11408,13 @@ void __sched notrace rpal_schedule(struct task_struct *next)
 
 	prev_state = READ_ONCE(prev->__state);
 	if (prev_state) {
-		try_to_block_task(rq, prev, &prev_state);
+		if (!try_to_block_task(rq, prev, &prev_state)) {
+			/*
+			 * As the task enters the TASK_RUNNING state, clean up
+			 * the RPAL_RECEIVER_STATE_READY_LS status.
+			 */
+			rpal_check_ready_state(prev, RPAL_RECEIVER_STATE_READY_LS);
+		}
 		switch_count = &prev->nvcsw;
 	}
 
-- 
2.20.1

