Message-ID: <f5ded505-6df3-4a70-a41c-7333c50fed4e@redhat.com>
Date: Tue, 22 Apr 2025 09:27:42 +0000 (UTC)
From: Gabriele Monaco <gmonaco@...hat.com>
To: Peter Zijlstra <peterz@...radead.org>
Cc: linux-kernel@...r.kernel.org, Ingo Molnar <mingo@...hat.org>,
	Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
Subject: Re: [PATCH v13 2/3] sched: Move task_mm_cid_work to mm work_struct

2025-04-14T15:28:34Z Mathieu Desnoyers <mathieu.desnoyers@...icios.com>:

> On 2025-04-14 08:36, Gabriele Monaco wrote:
>> Currently, the task_mm_cid_work function is called in a task work
>> triggered by a scheduler tick to periodically compact the mm_cids of
>> each process. This can delay the execution of the corresponding thread
>> for the entire duration of the function, negatively affecting the
>> response time of real-time tasks. In practice, we observe
>> task_mm_cid_work increasing the latency by 30-35us on a 128-core
>> system; this order of magnitude is meaningful under PREEMPT_RT.
>>
>> Run task_mm_cid_work in a new work_struct connected to the mm_struct
>> rather than in the task context before returning to userspace.
>>
>> This work_struct is initialised with the mm and disabled before the mm
>> is freed. The work is queued while returning to userspace in
>> __rseq_handle_notify_resume, keeping the checks that avoid running it
>> more frequently than MM_CID_SCAN_DELAY.
>>
>> To make sure this also happens predictably for long-running tasks,
>> trigger a call to __rseq_handle_notify_resume from the scheduler tick
>> as well, once the runtime exceeds a 100ms threshold.
>>
>> The main advantage of this change is that the function can be
>> offloaded to a different CPU and even preempted by RT tasks.
>>
>> Moreover, the new behaviour is more predictable for periodic tasks
>> with short runtimes, which may rarely run during a scheduler tick:
>> the work is now always scheduled when the task returns to userspace.
>>
>> The work is disabled during mmdrop: since the function cannot sleep in
>> all kernel configurations, we cannot wait for possibly running work
>> items to terminate. We make sure the mm stays valid in case the task
>> is terminating by pinning it with mmgrab/mmdrop, and return early from
>> the work if we turn out to be the last user by the time it runs.
>>
>> This situation is unlikely, since we don't schedule the work for
>> exiting tasks, but we cannot rule it out.
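
(Recap of the lifetime handling described above, as a simplified
restatement of the hunks below; the MM_CID_SCAN_DELAY rate limiting and
the actual compaction loop are omitted for brevity.)

	/* Return to userspace, in __rseq_handle_notify_resume(): */
	if (mm_cid_needs_scan(t->mm))
		task_queue_mm_cid(t);

	/* Queue the scan on the mm's work item, pinning the mm. */
	void task_queue_mm_cid(struct task_struct *curr)
	{
		mmgrab(curr->mm);
		queue_work(system_unbound_wq, &curr->mm->cid_work);
	}

	/* Workqueue handler; may run after the last thread exited. */
	void task_mm_cid_work(struct work_struct *work)
	{
		struct mm_struct *mm = container_of(work, struct mm_struct, cid_work);

		/* We are the last user, the process already terminated. */
		if (atomic_read(&mm->mm_count) == 1)
			goto out_drop;
		/* ... compact the per-CPU cids ... */
	out_drop:
		mmdrop(mm);	/* drop the reference taken before queuing */
	}

	/* Teardown: mm_destroy_cid() calls disable_work(&mm->cid_work). */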
>
> The implementation looks good to me. Peter, how does it look from your end?
>

Peter, what do you think about this version? Can we bring it in?

Thanks,
Gabriele

> Thanks,
>
> Mathieu
>
>> Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid")
>> Signed-off-by: Gabriele Monaco <gmonaco@...hat.com>
>> ---
>>   include/linux/mm_types.h | 26 ++++++++++++++
>>   include/linux/sched.h    |  8 ++++-
>>   kernel/rseq.c            |  2 ++
>>   kernel/sched/core.c      | 75 ++++++++++++++++++++++++++--------------
>>   kernel/sched/sched.h     |  6 ++--
>>   5 files changed, 89 insertions(+), 28 deletions(-)
>> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
>> index 56d07edd01f91..e4ae9295508cf 100644
>> --- a/include/linux/mm_types.h
>> +++ b/include/linux/mm_types.h
>> @@ -982,6 +982,10 @@ struct mm_struct {
>>          * mm nr_cpus_allowed updates.
>>          */
>>         raw_spinlock_t cpus_allowed_lock;
>> +       /*
>> +        * @cid_work: Work item to run the mm_cid scan.
>> +        */
>> +       struct work_struct cid_work;
>>   #endif
>>   #ifdef CONFIG_MMU
>>         atomic_long_t pgtables_bytes;   /* size of all page tables */
>> @@ -1282,6 +1286,8 @@ enum mm_cid_state {
>>     MM_CID_LAZY_PUT = (1U << 31),
>>   };
>>
>> +extern void task_mm_cid_work(struct work_struct *work);
>> +
>>   static inline bool mm_cid_is_unset(int cid)
>>   {
>>     return cid == MM_CID_UNSET;
>> @@ -1354,12 +1360,14 @@ static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *
>>     if (!mm->pcpu_cid)
>>         return -ENOMEM;
>>     mm_init_cid(mm, p);
>> +   INIT_WORK(&mm->cid_work, task_mm_cid_work);
>>     return 0;
>>   }
>>   #define mm_alloc_cid(...) alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__))
>>
>>   static inline void mm_destroy_cid(struct mm_struct *mm)
>>   {
>> +   disable_work(&mm->cid_work);
>>     free_percpu(mm->pcpu_cid);
>>     mm->pcpu_cid = NULL;
>>   }
>> @@ -1381,6 +1389,16 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas
>>     WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed));
>>     raw_spin_unlock(&mm->cpus_allowed_lock);
>>   }
>> +
>> +static inline bool mm_cid_needs_scan(struct mm_struct *mm)
>> +{
>> +   return mm && !time_before(jiffies, READ_ONCE(mm->mm_cid_next_scan));
>> +}
>> +
>> +static inline bool mm_cid_scan_pending(struct mm_struct *mm)
>> +{
>> +   return mm && work_pending(&mm->cid_work);
>> +}
>>   #else /* CONFIG_SCHED_MM_CID */
>>   static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { }
>>   static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; }
>> @@ -1391,6 +1409,14 @@ static inline unsigned int mm_cid_size(void)
>>     return 0;
>>   }
>>   static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { }
>> +static inline bool mm_cid_needs_scan(struct mm_struct *mm)
>> +{
>> +   return false;
>> +}
>> +static inline bool mm_cid_scan_pending(struct mm_struct *mm)
>> +{
>> +   return false;
>> +}
>>   #endif /* CONFIG_SCHED_MM_CID */
>>
>>   struct mmu_gather;
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index f96ac19828934..3ffdb96ef6b0a 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -1424,7 +1424,7 @@ struct task_struct {
>>     int             last_mm_cid;    /* Most recent cid in mm */
>>     int             migrate_from_cpu;
>>     int             mm_cid_active;  /* Whether cid bitmap is active */
>> -   struct callback_head        cid_work;
>> +   unsigned long           last_cid_reset; /* Time of last reset in jiffies */
>>   #endif
>>
>>     struct tlbflush_unmap_batch tlb_ubc;
>> @@ -2281,4 +2281,10 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo
>>   #define alloc_tag_restore(_tag, _old)     do {} while (0)
>>   #endif
>>
>> +#ifdef CONFIG_SCHED_MM_CID
>> +extern void task_queue_mm_cid(struct task_struct *curr);
>> +#else
>> +static inline void task_queue_mm_cid(struct task_struct *curr) { }
>> +#endif
>> +
>>   #endif
>> diff --git a/kernel/rseq.c b/kernel/rseq.c
>> index b7a1ec327e811..383db2ccad4d0 100644
>> --- a/kernel/rseq.c
>> +++ b/kernel/rseq.c
>> @@ -441,6 +441,8 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
>>     }
>>     if (unlikely(rseq_update_cpu_node_id(t)))
>>         goto error;
>> +   if (mm_cid_needs_scan(t->mm))
>> +       task_queue_mm_cid(t);
>>     return;
>>
>>   error:
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index c81cf642dba05..02b18649e6a09 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -10566,22 +10566,16 @@ static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
>>     sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
>>   }
>>
>> -static void task_mm_cid_work(struct callback_head *work)
>> +void task_mm_cid_work(struct work_struct *work)
>>   {
>>     unsigned long now = jiffies, old_scan, next_scan;
>> -   struct task_struct *t = current;
>>     struct cpumask *cidmask;
>> -   struct mm_struct *mm;
>> +   struct mm_struct *mm = container_of(work, struct mm_struct, cid_work);
>>     int weight, cpu;
>>
>> -   WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work));
>> -
>> -   work->next = work;  /* Prevent double-add */
>> -   if (t->flags & PF_EXITING)
>> -       return;
>> -   mm = t->mm;
>> -   if (!mm)
>> -       return;
>> +   /* We are the last user, process already terminated. */
>> +   if (atomic_read(&mm->mm_count) == 1)
>> +       goto out_drop;
>>     old_scan = READ_ONCE(mm->mm_cid_next_scan);
>>     next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
>>     if (!old_scan) {
>> @@ -10594,9 +10588,9 @@ static void task_mm_cid_work(struct callback_head *work)
>>             old_scan = next_scan;
>>     }
>>     if (time_before(now, old_scan))
>> -       return;
>> +       goto out_drop;
>>     if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
>> -       return;
>> +       goto out_drop;
>>     cidmask = mm_cidmask(mm);
>>     /* Clear cids that were not recently used. */
>>     for_each_possible_cpu(cpu)
>> @@ -10608,6 +10602,8 @@ static void task_mm_cid_work(struct callback_head *work)
>>      */
>>     for_each_possible_cpu(cpu)
>>         sched_mm_cid_remote_clear_weight(mm, cpu, weight);
>> +out_drop:
>> +   mmdrop(mm);
>>   }
>>
>>   void init_sched_mm_cid(struct task_struct *t)
>> @@ -10620,23 +10616,52 @@ void init_sched_mm_cid(struct task_struct *t)
>>         if (mm_users == 1)
>>             mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
>>     }
>> -   t->cid_work.next = &t->cid_work;    /* Protect against double add */
>> -   init_task_work(&t->cid_work, task_mm_cid_work);
>>   }
>>
>> -void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
>> +void task_tick_mm_cid(struct rq *rq, struct task_struct *t)
>>   {
>> -   struct callback_head *work = &curr->cid_work;
>> -   unsigned long now = jiffies;
>> +   u64 rtime = t->se.sum_exec_runtime - t->se.prev_sum_exec_runtime;
>>
>> -   if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
>> -       work->next != work)
>> -       return;
>> -   if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
>> -       return;
>> +   /*
>> +    * If a task is running unpreempted for a long time, it won't get its
>> +    * mm_cid compacted and won't update its mm_cid value after a
>> +    * compaction occurs.
>> +    * For such a task, this function does two things:
>> +    * A) trigger the mm_cid recompaction,
>> +    * B) trigger an update of the task's rseq->mm_cid field at some point
>> +    * after recompaction, so it can get a mm_cid value closer to 0.
>> +    * A change in the mm_cid triggers an rseq_preempt.
>> +    *
>> +    * A occurs only once after the scan time elapsed, until the next scan
>> +    * expires as well.
>> +    * B occurs once after the compaction work completes, that is when scan
>> +    * is no longer needed (it occurred for this mm) but the last rseq
>> +    * preempt was done before the last mm_cid scan.
>> +    */
>> +   if (t->mm && rtime > RSEQ_UNPREEMPTED_THRESHOLD) {
>> +       if (mm_cid_needs_scan(t->mm) && !mm_cid_scan_pending(t->mm))
>> +           rseq_set_notify_resume(t);
>> +       else if (time_after(jiffies, t->last_cid_reset +
>> +                     msecs_to_jiffies(MM_CID_SCAN_DELAY))) {
>> +           int old_cid = t->mm_cid;
>> +
>> +           if (!t->mm_cid_active)
>> +               return;
>> +           mm_cid_snapshot_time(rq, t->mm);
>> +           mm_cid_put_lazy(t);
>> +           t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, t->mm);
>> +           if (old_cid != t->mm_cid)
>> +               rseq_preempt(t);
>> +       }
>> +   }
>> +}
>>
>> -   /* No page allocation under rq lock */
>> -   task_work_add(curr, work, TWA_RESUME);
>> +/* Call only when curr is a user thread. */
>> +void task_queue_mm_cid(struct task_struct *curr)
>> +{
>> +   /* Ensure the mm exists when we run. */
>> +   mmgrab(curr->mm);
>> +   queue_work(system_unbound_wq, &curr->mm->cid_work);
>>   }
>>
>>   void sched_mm_cid_exit_signals(struct task_struct *t)
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index 47972f34ea701..c0f8fd4c575c3 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -3582,13 +3582,14 @@ extern const char *preempt_modes[];
>>
>>   #define SCHED_MM_CID_PERIOD_NS  (100ULL * 1000000)  /* 100ms */
>>   #define MM_CID_SCAN_DELAY 100         /* 100ms */
>> +#define RSEQ_UNPREEMPTED_THRESHOLD SCHED_MM_CID_PERIOD_NS
>>
>>   extern raw_spinlock_t cid_lock;
>>   extern int use_cid_lock;
>>
>>   extern void sched_mm_cid_migrate_from(struct task_struct *t);
>>   extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
>> -extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr);
>> +extern void task_tick_mm_cid(struct rq *rq, struct task_struct *t);
>>   extern void init_sched_mm_cid(struct task_struct *t);
>>
>>   static inline void __mm_cid_put(struct mm_struct *mm, int cid)
>> @@ -3798,6 +3799,7 @@ static inline int mm_cid_get(struct rq *rq, struct task_struct *t,
>>     cid = __mm_cid_get(rq, t, mm);
>>     __this_cpu_write(pcpu_cid->cid, cid);
>>     __this_cpu_write(pcpu_cid->recent_cid, cid);
>> +   t->last_cid_reset = jiffies;
>>
>>     return cid;
>>   }
>> @@ -3857,7 +3859,7 @@ static inline void switch_mm_cid(struct rq *rq,
>>   static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
>>   static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
>>   static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
>> -static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
>> +static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *t) { }
>>   static inline void init_sched_mm_cid(struct task_struct *t) { }
>>   #endif /* !CONFIG_SCHED_MM_CID */
>>  
>
>
> --
> Mathieu Desnoyers
> EfficiOS Inc.
> https://www.efficios.com

