[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <c01ddfc5-9410-14e1-55f7-c24f44447f8a@efficios.com>
Date: Thu, 20 Apr 2023 08:41:05 -0400
From: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
To: Aaron Lu <aaron.lu@...el.com>
Cc: Peter Zijlstra <peterz@...radead.org>,
linux-kernel@...r.kernel.org, Olivier Dion <odion@...icios.com>,
michael.christie@...cle.com
Subject: Re: [RFC PATCH v9 2/2] sched: Fix performance regression introduced
by mm_cid
On 2023-04-20 05:56, Aaron Lu wrote:
> On Wed, Apr 19, 2023 at 11:50:12AM -0400, Mathieu Desnoyers wrote:
>> Introduce per-mm/cpu current concurrency id (mm_cid) to fix a PostgreSQL
>> sysbench regression reported by Aaron Lu.
>
> mm_cid_get() dropped to 5.x% after I disable CONFIG_DEBUG_PREEMPT, using
> __this_cpu_X() doesn't help, I suppose that is because __this_cpu_X()
> still needs to fetch mm->pcpu_cid.
>
> Annotate mm_cid_get():
>
> │ static inline int mm_cid_get(struct mm_struct *mm)
> │ {
> 0.05 │ push %rbp
> 0.02 │ mov %rsp,%rbp
> │ push %r15
> │ push %r14
> │ push %r13
> │ push %r12
> │ push %rbx
> 0.02 │ sub $0x10,%rsp
> │ struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
> 71.30 │ mov 0x60(%rdi),%r12
> │ struct cpumask *cpumask;
> │ int cid;
> │
> │ lockdep_assert_rq_held(rq);
> │ cpumask = mm_cidmask(mm);
> │ cid = __this_cpu_read(pcpu_cid->cid);
> 28.44 │ mov %gs:0x8(%r12),%edx
> │ if (mm_cid_is_valid(cid)) {
>
>
> sched_mm_cid_migrate_to() is 4.x% and its annotation :
>
> │ dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
> │ mov -0x30(%rbp),%rax
> 54.53 │ mov 0x60(%r13),%rbx
> 19.61 │ movslq 0xaf0(%rax),%r15
>
> The reason why accessing mm->pcpu_cid is so costly is still a mystery to
> me...
Then we clearly have another member of mm_struct on the same cache line
as pcpu_cid which is bouncing all over the place and causing
false-sharing. Any idea which field(s) are causing this?
>
> BTW, I used below diff to mitigate the incorrect rq lock issue I
> described in my reply to v8:
Yes, I'll do something similar in my next version, thanks ! I'll also
drop my patch 1/2 from my patchset because clearly I was looking in the
wrong place.
Thanks,
Mathieu
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index c6e2dd8f4ee3..f16418731866 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -11662,7 +11662,7 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
> return;
> }
> /* Move src_cid to dst cpu. */
> - mm_cid_snapshot_time(mm);
> + mm_cid_snapshot_time(mm, cpu_of(dst_rq));
> WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
> }
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index d1d470441422..8b6a0c8ed3d1 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3348,12 +3348,13 @@ static inline int __mm_cid_try_get(struct mm_struct *mm)
> * Save a snapshot of the current runqueue time of this cpu
> * with the per-cpu cid value, allowing to estimate how recently it was used.
> */
> -static inline void mm_cid_snapshot_time(struct mm_struct *mm)
> +static inline void mm_cid_snapshot_time(struct mm_struct *mm, int cpu)
> {
> - struct rq *rq = this_rq();
> + struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
> + struct rq *rq = cpu_rq(cpu);
>
> lockdep_assert_rq_held(rq);
> - __this_cpu_write(mm->pcpu_cid->time, rq->clock);
> + WRITE_ONCE(pcpu_cid->time, rq->clock);
> }
>
> static inline int __mm_cid_get(struct mm_struct *mm)
> @@ -3404,7 +3405,7 @@ static inline int __mm_cid_get(struct mm_struct *mm)
> unlock:
> raw_spin_unlock(&cid_lock);
> end:
> - mm_cid_snapshot_time(mm);
> + mm_cid_snapshot_time(mm, raw_smp_processor_id());
> return cid;
> }
>
> @@ -3419,7 +3420,7 @@ static inline int mm_cid_get(struct mm_struct *mm)
> cpumask = mm_cidmask(mm);
> cid = __this_cpu_read(pcpu_cid->cid);
> if (mm_cid_is_valid(cid)) {
> - mm_cid_snapshot_time(mm);
> + mm_cid_snapshot_time(mm, raw_smp_processor_id());
> return cid;
> }
> if (mm_cid_is_lazy_put(cid)) {
> @@ -3467,7 +3468,7 @@ static inline void switch_mm_cid(struct task_struct *prev,
> */
> }
> if (prev->mm_cid_active) {
> - mm_cid_snapshot_time(prev->mm);
> + mm_cid_snapshot_time(prev->mm, raw_smp_processor_id());
> mm_cid_put_lazy(prev);
> prev->mm_cid = -1;
> }
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
Powered by blists - more mailing lists