Message-Id: <7eb30a577e2c6a4f582515357aea25260105eb18.1748594841.git.libo.gcs85@bytedance.com>
Date: Fri, 30 May 2025 17:27:55 +0800
From: Bo Li <libo.gcs85@...edance.com>
To: tglx@...utronix.de,
mingo@...hat.com,
bp@...en8.de,
dave.hansen@...ux.intel.com,
x86@...nel.org,
luto@...nel.org,
kees@...nel.org,
akpm@...ux-foundation.org,
david@...hat.com,
juri.lelli@...hat.com,
vincent.guittot@...aro.org,
peterz@...radead.org
Cc: dietmar.eggemann@....com,
hpa@...or.com,
acme@...nel.org,
namhyung@...nel.org,
mark.rutland@....com,
alexander.shishkin@...ux.intel.com,
jolsa@...nel.org,
irogers@...gle.com,
adrian.hunter@...el.com,
kan.liang@...ux.intel.com,
viro@...iv.linux.org.uk,
brauner@...nel.org,
jack@...e.cz,
lorenzo.stoakes@...cle.com,
Liam.Howlett@...cle.com,
vbabka@...e.cz,
rppt@...nel.org,
surenb@...gle.com,
mhocko@...e.com,
rostedt@...dmis.org,
bsegall@...gle.com,
mgorman@...e.de,
vschneid@...hat.com,
jannh@...gle.com,
pfalcato@...e.de,
riel@...riel.com,
harry.yoo@...cle.com,
linux-kernel@...r.kernel.org,
linux-perf-users@...r.kernel.org,
linux-fsdevel@...r.kernel.org,
linux-mm@...ck.org,
duanxiongchun@...edance.com,
yinhongbo@...edance.com,
dengliang.1214@...edance.com,
xieyongji@...edance.com,
chaiwen.cc@...edance.com,
songmuchun@...edance.com,
yuanzhu@...edance.com,
chengguozhu@...edance.com,
sunjiadong.lff@...edance.com,
Bo Li <libo.gcs85@...edance.com>
Subject: [RFC v2 27/35] RPAL: add epoll support

To support the epoll family of interfaces, RPAL adds RPAL-service-specific
handling to the existing epoll code, so that user space can drive RPAL
services through the same interfaces it already uses.

When the receiver thread calls epoll_wait(), it can set RPAL_EP_POLL_MAGIC
in the shared call context to ask the kernel to take the RPAL path. The
kernel then sets the receiver's state to RPAL_RECEIVER_STATE_READY and
transitions it to RPAL_RECEIVER_STATE_WAIT once the receiver has actually
been removed from the runqueue, at which point a sender may perform RPAL
calls on the receiver thread.

Signed-off-by: Bo Li <libo.gcs85@...edance.com>
---
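Note for reviewers: the sketch below is not part of the patch; it only
illustrates how a receiver thread is expected to arm this path from user
space. The helper rpal_epoll_wait() and the partial struct mirror are
hypothetical; only the rcc fields and RPAL_EP_POLL_MAGIC shown here are
introduced by this series, and a real deployment would get the shared
context layout from an RPAL userspace header.

/*
 * Illustrative userspace sketch, not part of this patch. The struct below
 * mirrors only the rcc fields used here (hypothetical partial layout); in
 * practice the receiver obtains the shared rpal_receiver_call_context from
 * the RPAL runtime.
 */
#include <sys/epoll.h>

#define RPAL_EP_POLL_MAGIC 0xCC98CC98

struct rpal_receiver_call_context {
	int rpal_ep_poll_magic;
	int epfd;
	void *events;
	int maxevents;
	int timeout;
	/* ... remaining fields elided ... */
};

static int rpal_epoll_wait(struct rpal_receiver_call_context *rcc, int epfd,
			   struct epoll_event *events, int maxevents,
			   int timeout)
{
	/*
	 * Mirror the call arguments into the shared call context so that
	 * rpal_try_send_events() can harvest events directly from rcc.
	 */
	rcc->epfd = epfd;
	rcc->events = events;
	rcc->maxevents = maxevents;
	rcc->timeout = timeout;

	/* Ask do_epoll_wait() to take the rpal_ep_poll() path. */
	rcc->rpal_ep_poll_magic = RPAL_EP_POLL_MAGIC;

	/* A plain epoll_wait(); the kernel switches to the RPAL logic. */
	return epoll_wait(epfd, events, maxevents, timeout);
}

The kernel side reads events and maxevents from rcc rather than from the
syscall arguments in rpal_try_send_events(), which is why the sketch mirrors
them into the shared context before blocking.
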
 arch/x86/rpal/core.c |   4 +
 fs/eventpoll.c       | 200 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/rpal.h |  21 +++++
 kernel/sched/core.c  |  17 ++++
 4 files changed, 242 insertions(+)
diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index 47c9e551344e..6a22b9faa100 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -9,6 +9,7 @@
#include <linux/rpal.h>
#include <linux/sched/task_stack.h>
#include <linux/pkeys.h>
+#include <linux/file.h>
#include <asm/fsgsbase.h>
#include "internal.h"
@@ -63,6 +64,7 @@ void rpal_kernel_ret(struct pt_regs *regs)
if (rpal_test_current_thread_flag(RPAL_RECEIVER_BIT)) {
rcc = current->rpal_rd->rcc;
+ regs->ax = rpal_try_send_events(current->rpal_rd->ep, rcc);
atomic_xchg(&rcc->receiver_state, RPAL_RECEIVER_STATE_KERNEL_RET);
} else {
tsk = current->rpal_sd->receiver;
@@ -142,6 +144,7 @@ rpal_do_kernel_context_switch(struct task_struct *next, struct pt_regs *regs)
struct task_struct *prev = current;
if (rpal_test_task_thread_flag(next, RPAL_LAZY_SWITCHED_BIT)) {
+ rpal_resume_ep(next);
current->rpal_sd->receiver = next;
rpal_lock_cpu(current);
rpal_lock_cpu(next);
@@ -154,6 +157,7 @@ rpal_do_kernel_context_switch(struct task_struct *next, struct pt_regs *regs)
*/
rebuild_sender_stack(current->rpal_sd, regs);
rpal_schedule(next);
+ fdput(next->rpal_rd->f);
} else {
update_dst_stack(next, regs);
/*
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index d4dbffdedd08..437cd5764c03 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -38,6 +38,7 @@
#include <linux/compat.h>
#include <linux/rculist.h>
#include <linux/capability.h>
+#include <linux/rpal.h>
#include <net/busy_poll.h>
/*
@@ -2141,6 +2142,187 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
}
}
+#ifdef CONFIG_RPAL
+
+void rpal_resume_ep(struct task_struct *tsk)
+{
+ struct rpal_receiver_data *rrd = tsk->rpal_rd;
+ struct eventpoll *ep = (struct eventpoll *)rrd->ep;
+ struct rpal_receiver_call_context *rcc = rrd->rcc;
+
+ if (rcc->timeout > 0) {
+ hrtimer_cancel(&rrd->ep_sleeper.timer);
+ destroy_hrtimer_on_stack(&rrd->ep_sleeper.timer);
+ }
+ if (!list_empty_careful(&rrd->ep_wait.entry)) {
+ write_lock(&ep->lock);
+ __remove_wait_queue(&ep->wq, &rrd->ep_wait);
+ write_unlock(&ep->lock);
+ }
+}
+
+int rpal_try_send_events(void *ep, struct rpal_receiver_call_context *rcc)
+{
+ int eavail;
+ int res = 0;
+
+ res = ep_send_events(ep, rcc->events, rcc->maxevents);
+ if (res > 0)
+ ep_suspend_napi_irqs(ep);
+
+ eavail = ep_events_available(ep);
+ if (!eavail) {
+ atomic_and(~RPAL_KERNEL_PENDING, &rcc->ep_pending);
+ /* check again to avoid data race on RPAL_KERNEL_PENDING */
+ eavail = ep_events_available(ep);
+ if (eavail)
+ atomic_or(RPAL_KERNEL_PENDING, &rcc->ep_pending);
+ }
+ return res;
+}
+
+static int rpal_schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
+ const enum hrtimer_mode mode,
+ clockid_t clock_id)
+{
+ struct hrtimer_sleeper *t = &current->rpal_rd->ep_sleeper;
+
+ /*
+ * Optimize when a zero timeout value is given. It does not
+ * matter whether this is an absolute or a relative time.
+ */
+ if (expires && *expires == 0) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ /*
+ * A NULL parameter means "infinite"
+ */
+ if (!expires) {
+ schedule();
+ return -EINTR;
+ }
+
+ hrtimer_setup_sleeper_on_stack(t, clock_id, mode);
+ hrtimer_set_expires_range_ns(&t->timer, *expires, delta);
+ hrtimer_sleeper_start_expires(t, mode);
+
+ if (likely(t->task))
+ schedule();
+
+ hrtimer_cancel(&t->timer);
+ destroy_hrtimer_on_stack(&t->timer);
+
+ __set_current_state(TASK_RUNNING);
+
+ return !t->task ? 0 : -EINTR;
+}
+
+static int rpal_ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
+ int maxevents, struct timespec64 *timeout)
+{
+ int res = 0, eavail, timed_out = 0;
+ u64 slack = 0;
+ struct rpal_receiver_data *rrd = current->rpal_rd;
+ wait_queue_entry_t *wait = &rrd->ep_wait;
+ ktime_t expires, *to = NULL;
+
+ rrd->ep = ep;
+
+ lockdep_assert_irqs_enabled();
+
+ if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
+ slack = select_estimate_accuracy(timeout);
+ to = &expires;
+ *to = timespec64_to_ktime(*timeout);
+ } else if (timeout) {
+ timed_out = 1;
+ }
+
+ eavail = ep_events_available(ep);
+
+ while (1) {
+ if (eavail) {
+ res = rpal_try_send_events(ep, rrd->rcc);
+ if (res) {
+ atomic_xchg(&rrd->rcc->receiver_state,
+ RPAL_RECEIVER_STATE_RUNNING);
+ return res;
+ }
+ }
+
+ if (timed_out) {
+ atomic_xchg(&rrd->rcc->receiver_state,
+ RPAL_RECEIVER_STATE_RUNNING);
+ return 0;
+ }
+
+ eavail = ep_busy_loop(ep);
+ if (eavail)
+ continue;
+
+ if (signal_pending(current)) {
+ atomic_xchg(&rrd->rcc->receiver_state,
+ RPAL_RECEIVER_STATE_RUNNING);
+ return -EINTR;
+ }
+
+ init_wait(wait);
+ wait->func = rpal_ep_autoremove_wake_function;
+ wait->private = rrd;
+ write_lock_irq(&ep->lock);
+
+ atomic_xchg(&rrd->rcc->receiver_state,
+ RPAL_RECEIVER_STATE_READY);
+ __set_current_state(TASK_INTERRUPTIBLE);
+
+ eavail = ep_events_available(ep);
+ if (!eavail)
+ __add_wait_queue_exclusive(&ep->wq, wait);
+
+ write_unlock_irq(&ep->lock);
+
+ if (!eavail && ep_schedule_timeout(to)) {
+ if (RPAL_USER_PENDING & atomic_read(&rrd->rcc->ep_pending)) {
+ timed_out = 1;
+ } else {
+ timed_out =
+ !rpal_schedule_hrtimeout_range_clock(
+ to, slack, HRTIMER_MODE_ABS,
+ CLOCK_MONOTONIC);
+ }
+ }
+ atomic_cmpxchg(&rrd->rcc->receiver_state,
+ RPAL_RECEIVER_STATE_READY,
+ RPAL_RECEIVER_STATE_RUNNING);
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * We were woken up, thus go and try to harvest some events.
+ * If timed out and still on the wait queue, recheck eavail
+ * carefully under lock, below.
+ */
+ eavail = 1;
+
+ if (!list_empty_careful(&wait->entry)) {
+ write_lock_irq(&ep->lock);
+ /*
+ * If the thread timed out and is not on the wait queue,
+ * it means that the thread was woken up after its
+ * timeout expired before it could reacquire the lock.
+ * Thus, when wait.entry is empty, it needs to harvest
+ * events.
+ */
+ if (timed_out)
+ eavail = list_empty(&wait->entry);
+ __remove_wait_queue(&ep->wq, wait);
+ write_unlock_irq(&ep->lock);
+ }
+ }
+}
+#endif
+
/**
* ep_loop_check_proc - verify that adding an epoll file inside another
* epoll structure does not violate the constraints, in
@@ -2529,7 +2711,25 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
ep = fd_file(f)->private_data;
/* Time to fish for events ... */
+#ifdef CONFIG_RPAL
+ /*
+ * For an RPAL task, if it is a receiver and it has set RPAL_EP_POLL_MAGIC
+ * in shared memory, we assume it is prepared for RPAL calls and handle it
+ * differently.
+ *
+ * In all other cases, an RPAL task behaves like a normal task.
+ */
+ if (rpal_current_service() &&
+ rpal_test_current_thread_flag(RPAL_RECEIVER_BIT) &&
+ current->rpal_rd->rcc->rpal_ep_poll_magic == RPAL_EP_POLL_MAGIC) {
+ current->rpal_rd->f = f;
+ return rpal_ep_poll(ep, events, maxevents, to);
+ } else {
+ return ep_poll(ep, events, maxevents, to);
+ }
+#else
return ep_poll(ep, events, maxevents, to);
+#endif
}
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index f2474cb53abe..5912ffec6e28 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -16,6 +16,8 @@
#include <linux/hashtable.h>
#include <linux/atomic.h>
#include <linux/sizes.h>
+#include <linux/file.h>
+#include <linux/hrtimer.h>
#define RPAL_ERROR_MSG "rpal error: "
#define rpal_err(x...) pr_err(RPAL_ERROR_MSG x)
@@ -89,6 +91,7 @@ enum {
};
#define RPAL_ERROR_MAGIC 0x98CC98CC
+#define RPAL_EP_POLL_MAGIC 0xCC98CC98
#define RPAL_SID_SHIFT 24
#define RPAL_ID_SHIFT 8
@@ -103,6 +106,9 @@ enum {
#define RPAL_PKRU_UNION 1
#define RPAL_PKRU_INTERSECT 2
+#define RPAL_KERNEL_PENDING 0x1
+#define RPAL_USER_PENDING 0x2
+
extern unsigned long rpal_cap;
enum rpal_task_flag_bits {
@@ -282,6 +288,12 @@ struct rpal_receiver_call_context {
int receiver_id;
atomic_t receiver_state;
atomic_t sender_state;
+ atomic_t ep_pending;
+ int rpal_ep_poll_magic;
+ int epfd;
+ void __user *events;
+ int maxevents;
+ int timeout;
};
/* recovery point for sender */
@@ -325,6 +337,10 @@ struct rpal_receiver_data {
struct rpal_shared_page *rsp;
struct rpal_receiver_call_context *rcc;
struct task_struct *sender;
+ void *ep;
+ struct fd f;
+ struct hrtimer_sleeper ep_sleeper;
+ wait_queue_entry_t ep_wait;
};
struct rpal_sender_data {
@@ -574,4 +590,9 @@ __rpal_switch_to(struct task_struct *prev_p, struct task_struct *next_p);
asmlinkage __visible void rpal_schedule_tail(struct task_struct *prev);
int do_rpal_mprotect_pkey(unsigned long start, size_t len, int pkey);
void rpal_set_pku_schedule_tail(struct task_struct *prev);
+int rpal_ep_autoremove_wake_function(wait_queue_entry_t *curr,
+ unsigned int mode, int wake_flags,
+ void *key);
+void rpal_resume_ep(struct task_struct *tsk);
+int rpal_try_send_events(void *ep, struct rpal_receiver_call_context *rcc);
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index eb5d5bd51597..486d59bdd3fc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6794,6 +6794,23 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
#define SM_RTLOCK_WAIT 2
#ifdef CONFIG_RPAL
+int rpal_ep_autoremove_wake_function(wait_queue_entry_t *curr,
+ unsigned int mode, int wake_flags,
+ void *key)
+{
+ struct rpal_receiver_data *rrd = curr->private;
+ struct task_struct *tsk = rrd->rcd.bp_task;
+ int ret;
+
+ ret = try_to_wake_up(tsk, mode, wake_flags);
+
+ list_del_init_careful(&curr->entry);
+ if (!ret)
+ atomic_or(RPAL_KERNEL_PENDING, &rrd->rcc->ep_pending);
+
+ return 1;
+}
+
static inline void rpal_check_ready_state(struct task_struct *tsk, int state)
{
if (rpal_test_task_thread_flag(tsk, RPAL_RECEIVER_BIT)) {
--
2.20.1