[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250916100953.GG3245006@noisy.programming.kicks-ass.net>
Date: Tue, 16 Sep 2025 12:09:53 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: Oleg Nesterov <oleg@...hat.com>
Cc: "Luis Claudio R. Goncalves" <lgoncalv@...hat.com>,
Sebastian Andrzej Siewior <bigeasy@...utronix.de>,
Clark Williams <clrkwllms@...nel.org>,
Steven Rostedt <rostedt@...dmis.org>, Tejun Heo <tj@...nel.org>,
David Vernet <dvernet@...a.com>, Barret Rhoden <brho@...gle.com>,
Josh Don <joshdon@...gle.com>, Crystal Wood <crwood@...hat.com>,
linux-kernel@...r.kernel.org, linux-rt-devel@...ts.linux.dev,
Juri Lelli <juri.lelli@...hat.com>, Ben Segall <bsegall@...gle.com>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Ingo Molnar <mingo@...hat.com>, Mel Gorman <mgorman@...e.de>,
Valentin Schneider <vschneid@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>,
Thomas Gleixner <tglx@...utronix.de>,
Wander Lairson Costa <wander@...hat.com>
Subject: Re: [RESEND PATCH] sched: restore the behavior of put_task_struct()
for non-rt
On Mon, Sep 15, 2025 at 02:35:40PM +0200, Oleg Nesterov wrote:
> On 09/15, Peter Zijlstra wrote:
> >
> > On Mon, Sep 15, 2025 at 08:15:19AM -0300, Luis Claudio R. Goncalves wrote:
> > > Commit 8671bad873eb ("sched: Do not call __put_task_struct() on rt
> > > if pi_blocked_on is set") changed the behavior of put_task_struct()
> > > unconditionally, even when PREEMPT_RT was not enabled, in clear mismatch
> > > with the commit description.
> > >
> > > Restore the previous behavior of put_task_struct() for the PREEMPT_RT
> > > disabled case.
> > >
> > > Fixes: 8671bad873eb ("sched: Do not call __put_task_struct() on rt if pi_blocked_on is set")
> > > Acked-by: Oleg Nesterov <oleg@...hat.com>
> > > Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@...hat.com>
> > > ---
> > >
> > > Note: This patch is a fix motivated by Oleg Nesterov's question at
> > > https://lore.kernel.org/linux-rt-devel/20250728201441.GA4690@redhat.com/
> >
> > Right, but I thought we did want to make this behaviour consistent.
> >
> > Why have !RT behave differently? That is, why isn't this simply a
> > 'buggy' comment/changelog issue?
>
> Well, this was discussed several times, in particular see
> https://lore.kernel.org/all/CAHk-=whtj+aSYftniMRG4xvFE8dmmYyrqcJyPmzStsfj5w9r=w@mail.gmail.com/
>
> And task_struct->rcu_users was introduced to avoid RCU call in put_task_struct() ...
Ah, I forgot about that thing.. Although I had vague memories of that
argument on rcu_assign_pointer() vs RCU_INIT_POINTER().
> But I won't really argue if you decide to remove this !RT optimization, although
> I think it would be better to do this in a separate patch.
Right. So when they wanted to remove that preemptible() clause, I was
like why again do we have this !RT exception at all, and can't we get
rid of that.
If git isn't confusing me again, this got merged in this cycle. But so
far no benchmark came and told us this was a bad idea.
So what do we do now... do we restore the !RT exception (so far there
aren't any numbers to suggest this mattered) or do we let it be for a
bit and then go and clean things up?
---
include/linux/sched.h | 1 -
include/linux/sched/task.h | 37 +++----------------------------------
kernel/bpf/helpers.c | 6 +++---
kernel/exit.c | 21 ++-------------------
kernel/fork.c | 14 +++++++++-----
kernel/sched/core.c | 3 +--
6 files changed, 18 insertions(+), 64 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 644a01bdae70..fd6586c04ed3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1576,7 +1576,6 @@ struct task_struct {
# endif
#endif
struct rcu_head rcu;
- refcount_t rcu_users;
int pagefault_disabled;
#ifdef CONFIG_MMU
struct task_struct *oom_reaper_list;
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index ea41795a352b..1125c44b205a 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -122,41 +122,12 @@ static inline struct task_struct *tryget_task_struct(struct task_struct *t)
return refcount_inc_not_zero(&t->usage) ? t : NULL;
}
-extern void __put_task_struct(struct task_struct *t);
extern void __put_task_struct_rcu_cb(struct rcu_head *rhp);
static inline void put_task_struct(struct task_struct *t)
{
- if (!refcount_dec_and_test(&t->usage))
- return;
-
- /*
- * Under PREEMPT_RT, we can't call __put_task_struct
- * in atomic context because it will indirectly
- * acquire sleeping locks. The same is true if the
- * current process has a mutex enqueued (blocked on
- * a PI chain).
- *
- * In !RT, it is always safe to call __put_task_struct().
- * Though, in order to simplify the code, resort to the
- * deferred call too.
- *
- * call_rcu() will schedule __put_task_struct_rcu_cb()
- * to be called in process context.
- *
- * __put_task_struct() is called when
- * refcount_dec_and_test(&t->usage) succeeds.
- *
- * This means that it can't "conflict" with
- * put_task_struct_rcu_user() which abuses ->rcu the same
- * way; rcu_users has a reference so task->usage can't be
- * zero after rcu_users 1 -> 0 transition.
- *
- * delayed_free_task() also uses ->rcu, but it is only called
- * when it fails to fork a process. Therefore, there is no
- * way it can conflict with __put_task_struct().
- */
- call_rcu(&t->rcu, __put_task_struct_rcu_cb);
+ if (refcount_dec_and_test(&t->usage))
+ call_rcu(&t->rcu, __put_task_struct_rcu_cb);
}
DEFINE_FREE(put_task, struct task_struct *, if (_T) put_task_struct(_T))
@@ -164,11 +135,9 @@ DEFINE_FREE(put_task, struct task_struct *, if (_T) put_task_struct(_T))
static inline void put_task_struct_many(struct task_struct *t, int nr)
{
if (refcount_sub_and_test(nr, &t->usage))
- __put_task_struct(t);
+ call_rcu(&t->rcu, __put_task_struct_rcu_cb);
}
-void put_task_struct_rcu_user(struct task_struct *task);
-
/* Free all architecture-specific resources held by a thread. */
void release_thread(struct task_struct *dead_task);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 8af62cb243d9..bdc4e65bca5c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2459,7 +2459,7 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_right(struct bpf_rb_root *root, struc
*/
__bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p)
{
- if (refcount_inc_not_zero(&p->rcu_users))
+ if (refcount_inc_not_zero(&p->usage))
return p;
return NULL;
}
@@ -2470,12 +2470,12 @@ __bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p)
*/
__bpf_kfunc void bpf_task_release(struct task_struct *p)
{
- put_task_struct_rcu_user(p);
+ put_task_struct(p);
}
__bpf_kfunc void bpf_task_release_dtor(void *p)
{
- put_task_struct_rcu_user(p);
+ put_task_struct(p);
}
CFI_NOSEAL(bpf_task_release_dtor);
diff --git a/kernel/exit.c b/kernel/exit.c
index 343eb97543d5..240a6faa0e26 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -220,23 +220,6 @@ static void __exit_signal(struct release_task_post *post, struct task_struct *ts
tty_kref_put(tty);
}
-static void delayed_put_task_struct(struct rcu_head *rhp)
-{
- struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
-
- kprobe_flush_task(tsk);
- rethook_flush_task(tsk);
- perf_event_delayed_put(tsk);
- trace_sched_process_free(tsk);
- put_task_struct(tsk);
-}
-
-void put_task_struct_rcu_user(struct task_struct *task)
-{
- if (refcount_dec_and_test(&task->rcu_users))
- call_rcu(&task->rcu, delayed_put_task_struct);
-}
-
void __weak release_thread(struct task_struct *dead_task)
{
}
@@ -305,7 +288,7 @@ void release_task(struct task_struct *p)
if (thread_group_leader(p))
flush_sigqueue(&p->signal->shared_pending);
- put_task_struct_rcu_user(p);
+ put_task_struct(p);
p = leader;
if (unlikely(zap_leader))
@@ -1057,7 +1040,7 @@ void __noreturn make_task_dead(int signr)
pr_alert("Fixing recursive fault but reboot is needed!\n");
futex_exit_recursive(tsk);
tsk->exit_state = EXIT_DEAD;
- refcount_inc(&tsk->rcu_users);
+ get_task_struct(tsk);
do_task_dead();
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 33dfb82af25b..383c811626a3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -106,6 +106,7 @@
#include <linux/pidfs.h>
#include <linux/tick.h>
#include <linux/unwind_deferred.h>
+#include <linux/kprobes.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -729,12 +730,17 @@ static inline void put_signal_struct(struct signal_struct *sig)
free_signal_struct(sig);
}
-void __put_task_struct(struct task_struct *tsk)
+static void __put_task_struct(struct task_struct *tsk)
{
WARN_ON(!tsk->exit_state);
WARN_ON(refcount_read(&tsk->usage));
WARN_ON(tsk == current);
+ kprobe_flush_task(tsk);
+ rethook_flush_task(tsk);
+ perf_event_delayed_put(tsk);
+ trace_sched_process_free(tsk);
+
unwind_task_free(tsk);
sched_ext_free(tsk);
io_uring_free(tsk);
@@ -915,12 +921,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
dup_user_cpus_ptr(tsk, orig, node);
/*
- * One for the user space visible state that goes away when reaped.
+ * One for user space visible state that goes away when reaped.
* One for the scheduler.
*/
- refcount_set(&tsk->rcu_users, 2);
- /* One for the rcu users */
- refcount_set(&tsk->usage, 1);
+ refcount_set(&tsk->usage, 2);
#ifdef CONFIG_BLK_DEV_IO_TRACE
tsk->btrace_seq = 0;
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index da2062de97a2..ad3c0e9f3eb4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5259,8 +5259,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
/* Task is done with its stack. */
put_task_stack(prev);
-
- put_task_struct_rcu_user(prev);
+ put_task_struct(prev);
}
return rq;
Powered by blists - more mailing lists