Message-ID: <8620f69a3726e2e818305027ef79605decbd4148.camel@redhat.com>
Date: Wed, 27 Aug 2025 08:55:18 +0200
From: Gabriele Monaco <gmonaco@...hat.com>
To: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
Cc: linux-kernel@...r.kernel.org, Andrew Morton <akpm@...ux-foundation.org>,
David Hildenbrand <david@...hat.com>, Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>, "Paul E. McKenney"
<paulmck@...nel.org>, linux-mm@...ck.org, Thomas Gleixner
<tglx@...utronix.de>
Subject: Re: [PATCH v2 2/4] rseq: Run the mm_cid_compaction from
rseq_handle_notify_resume()
On Tue, 2025-08-26 at 14:01 -0400, Mathieu Desnoyers wrote:
> On 2025-07-16 12:06, Gabriele Monaco wrote:
> > Currently the mm_cid compaction is triggered by the scheduler tick
> > and runs in a task_work; this behaviour is unpredictable for
> > periodic tasks with short runtimes, which may rarely be running
> > during a tick.
> >
> > Run the mm_cid compaction from the rseq_handle_notify_resume()
> > call, which runs from resume_user_mode_work. Since this is the same
> > context where the task_work would run, skip that step and call the
> > compaction function directly.
> > The compaction function still exits early when the scan is not
> > required, that is, when the pseudo-period of 100ms has not elapsed.
> >
> > Keep a tick handler for long-running tasks that are never preempted
> > (i.e. that never call rseq_handle_notify_resume); it triggers a
> > compaction and mm_cid update only in that case.
>
> Your approach looks good, but please note that this will probably
> need to be rebased on top of the rseq rework from Thomas Gleixner.
>
> Latest version can be found here:
>
> https://lore.kernel.org/lkml/20250823161326.635281786@linutronix.de/
>
Mmh, that's quite a large one, thanks for sharing!
I'm going to have a look, but I guess it might make sense to wait
until that series is included.
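
For anyone wanting to see the user-visible effect of the compaction, a
quick sketch like the one below can watch the rseq mm_cid field from
userspace. This is just my illustration, not part of the patch, and it
assumes glibc >= 2.35 (which registers rseq and exports __rseq_offset
in <sys/rseq.h>), a kernel recent enough to populate rseq->mm_cid, and
a compiler providing __builtin_thread_pointer():

#include <stdio.h>
#include <unistd.h>
#include <sys/rseq.h>	/* struct rseq, __rseq_offset (glibc >= 2.35) */

/* glibc registers the rseq area at a fixed offset from the thread pointer. */
static inline struct rseq *thread_rseq(void)
{
	return (struct rseq *)((char *)__builtin_thread_pointer() +
			       __rseq_offset);
}

int main(void)
{
	for (int i = 0; i < 5; i++) {
		/* A compacted mm_cid is packed near 0, bounded by
		 * min(nr threads, nr allowed CPUs). */
		printf("cpu=%u mm_cid=%u\n",
		       thread_rseq()->cpu_id, thread_rseq()->mm_cid);
		usleep(200 * 1000);	/* longer than the 100ms scan delay */
	}
	return 0;
}

With a single runnable thread, mm_cid should settle to 0 shortly after
a compaction scan.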
Thanks,
Gabriele
> Thanks,
>
> Mathieu
>
> >
> > Signed-off-by: Gabriele Monaco <gmonaco@...hat.com>
> > ---
> >  include/linux/mm.h       |  2 ++
> >  include/linux/mm_types.h | 11 ++++++++
> >  include/linux/sched.h    |  2 +-
> >  kernel/rseq.c            |  2 ++
> >  kernel/sched/core.c      | 55 +++++++++++++++++++++++++---------------
> >  kernel/sched/sched.h     |  2 ++
> >  6 files changed, 53 insertions(+), 21 deletions(-)
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index fa538feaa8d95..cc8c1c9ae26c1 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -2294,6 +2294,7 @@ void sched_mm_cid_before_execve(struct task_struct *t);
> >  void sched_mm_cid_after_execve(struct task_struct *t);
> >  void sched_mm_cid_fork(struct task_struct *t);
> >  void sched_mm_cid_exit_signals(struct task_struct *t);
> > +void task_mm_cid_work(struct task_struct *t);
> >  static inline int task_mm_cid(struct task_struct *t)
> >  {
> >  	return t->mm_cid;
> > @@ -2303,6 +2304,7 @@ static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
> >  static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
> >  static inline void sched_mm_cid_fork(struct task_struct *t) { }
> >  static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
> > +static inline void task_mm_cid_work(struct task_struct *t) { }
> >  static inline int task_mm_cid(struct task_struct *t)
> >  {
> >  	/*
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index d6b91e8a66d6d..e6d6e468e64b4 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -1420,6 +1420,13 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas
> >  	WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed));
> >  	raw_spin_unlock(&mm->cpus_allowed_lock);
> >  }
> > +
> > +static inline bool mm_cid_needs_scan(struct mm_struct *mm)
> > +{
> > +	if (!mm)
> > +		return false;
> > +	return time_after(jiffies, READ_ONCE(mm->mm_cid_next_scan));
> > +}
> >  #else /* CONFIG_SCHED_MM_CID */
> >  static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { }
> >  static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; }
> > @@ -1430,6 +1437,10 @@ static inline unsigned int mm_cid_size(void)
> >  	return 0;
> >  }
> >  static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { }
> > +static inline bool mm_cid_needs_scan(struct mm_struct *mm)
> > +{
> > +	return false;
> > +}
> >  #endif /* CONFIG_SCHED_MM_CID */
> >
> >  struct mmu_gather;
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index aa9c5be7a6325..a75f61cea2271 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -1428,7 +1428,7 @@ struct task_struct {
> >  	int			last_mm_cid;	/* Most recent cid in mm */
> >  	int			migrate_from_cpu;
> >  	int			mm_cid_active;	/* Whether cid bitmap is active */
> > -	struct callback_head	cid_work;
> > +	unsigned long		last_cid_reset;	/* Time of last reset in jiffies */
> >  #endif
> >
> >  	struct tlbflush_unmap_batch	tlb_ubc;
> > diff --git a/kernel/rseq.c b/kernel/rseq.c
> > index b7a1ec327e811..100f81e330dc6 100644
> > --- a/kernel/rseq.c
> > +++ b/kernel/rseq.c
> > @@ -441,6 +441,8 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
> >  	}
> >  	if (unlikely(rseq_update_cpu_node_id(t)))
> >  		goto error;
> > +	/* The mm_cid compaction returns prematurely if scan is not needed. */
> > +	task_mm_cid_work(t);
> >  	return;
> >
> >  error:
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 81c6df746df17..27b856a1cb0a9 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -10589,22 +10589,13 @@ static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
> >  	sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
> >  }
> >
> > -static void task_mm_cid_work(struct callback_head *work)
> > +void task_mm_cid_work(struct task_struct *t)
> >  {
> >  	unsigned long now = jiffies, old_scan, next_scan;
> > -	struct task_struct *t = current;
> >  	struct cpumask *cidmask;
> > -	struct mm_struct *mm;
> >  	int weight, cpu;
> > +	struct mm_struct *mm = t->mm;
> >
> > -	WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work));
> > -
> > -	work->next = work;	/* Prevent double-add */
> > -	if (t->flags & PF_EXITING)
> > -		return;
> > -	mm = t->mm;
> > -	if (!mm)
> > -		return;
> >  	old_scan = READ_ONCE(mm->mm_cid_next_scan);
> >  	next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
> >  	if (!old_scan) {
> > @@ -10643,23 +10634,47 @@ void init_sched_mm_cid(struct task_struct *t)
> >  		if (mm_users == 1)
> >  			mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
> >  	}
> > -	t->cid_work.next = &t->cid_work;	/* Protect against double add */
> > -	init_task_work(&t->cid_work, task_mm_cid_work);
> >  }
> >
> >  void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
> >  {
> > -	struct callback_head *work = &curr->cid_work;
> > -	unsigned long now = jiffies;
> > +	u64 rtime = curr->se.sum_exec_runtime - curr->se.prev_sum_exec_runtime;
> >
> > +	/*
> > +	 * If a task is running unpreempted for a long time, it won't get its
> > +	 * mm_cid compacted and won't update its mm_cid value after a
> > +	 * compaction occurs.
> > +	 * For such a task, this function does two things:
> > +	 * A) trigger the mm_cid recompaction,
> > +	 * B) trigger an update of the task's rseq->mm_cid field at some point
> > +	 * after recompaction, so it can get a mm_cid value closer to 0.
> > +	 * A change in the mm_cid triggers an rseq_preempt.
> > +	 *
> > +	 * B occurs once after the compaction work completes, neither A nor B
> > +	 * run as long as the compaction work is pending, the task is exiting
> > +	 * or is not a userspace task.
> > +	 */
> >  	if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
> > -	    work->next != work)
> > +	    test_tsk_thread_flag(curr, TIF_NOTIFY_RESUME))
> >  		return;
> > -	if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
> > +	if (rtime < RSEQ_UNPREEMPTED_THRESHOLD)
> >  		return;
> > -
> > -	/* No page allocation under rq lock */
> > -	task_work_add(curr, work, TWA_RESUME);
> > +	if (mm_cid_needs_scan(curr->mm)) {
> > +		/* Trigger mm_cid recompaction */
> > +		rseq_set_notify_resume(curr);
> > +	} else if (time_after(jiffies, curr->last_cid_reset +
> > +			      msecs_to_jiffies(MM_CID_SCAN_DELAY))) {
> > +		/* Update mm_cid field */
> > +		int old_cid = curr->mm_cid;
> > +
> > +		if (!curr->mm_cid_active)
> > +			return;
> > +		mm_cid_snapshot_time(rq, curr->mm);
> > +		mm_cid_put_lazy(curr);
> > +		curr->last_mm_cid = curr->mm_cid = mm_cid_get(rq, curr, curr->mm);
> > +		if (old_cid != curr->mm_cid)
> > +			rseq_preempt(curr);
> > +	}
> >  }
> >
> >  void sched_mm_cid_exit_signals(struct task_struct *t)
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index 475bb5998295e..90a5b58188232 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -3606,6 +3606,7 @@ extern const char *preempt_modes[];
> >
> >  #define SCHED_MM_CID_PERIOD_NS	(100ULL * 1000000)	/* 100ms */
> >  #define MM_CID_SCAN_DELAY	100			/* 100ms */
> > +#define RSEQ_UNPREEMPTED_THRESHOLD	SCHED_MM_CID_PERIOD_NS
> >
> >  extern raw_spinlock_t cid_lock;
> >  extern int use_cid_lock;
> > @@ -3809,6 +3810,7 @@ static inline int mm_cid_get(struct rq *rq, struct task_struct *t,
> >  	int cid;
> >
> >  	lockdep_assert_rq_held(rq);
> > +	t->last_cid_reset = jiffies;
> >  	cpumask = mm_cidmask(mm);
> >  	cid = __this_cpu_read(pcpu_cid->cid);
> >  	if (mm_cid_is_valid(cid)) {
>