Message-ID: <c01ddfc5-9410-14e1-55f7-c24f44447f8a@efficios.com>
Date:   Thu, 20 Apr 2023 08:41:05 -0400
From:   Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
To:     Aaron Lu <aaron.lu@...el.com>
Cc:     Peter Zijlstra <peterz@...radead.org>,
        linux-kernel@...r.kernel.org, Olivier Dion <odion@...icios.com>,
        michael.christie@...cle.com
Subject: Re: [RFC PATCH v9 2/2] sched: Fix performance regression introduced
 by mm_cid

On 2023-04-20 05:56, Aaron Lu wrote:
> On Wed, Apr 19, 2023 at 11:50:12AM -0400, Mathieu Desnoyers wrote:
>> Introduce per-mm/cpu current concurrency id (mm_cid) to fix a PostgreSQL
>> sysbench regression reported by Aaron Lu.
> 
> mm_cid_get() dropped to 5.x% after I disabled CONFIG_DEBUG_PREEMPT.
> Using __this_cpu_X() doesn't help; I suppose that is because
> __this_cpu_X() still needs to fetch mm->pcpu_cid.
> 
> Annotation of mm_cid_get():
> 
>         │     static inline int mm_cid_get(struct mm_struct *mm)
>         │     {
>    0.05 │       push   %rbp
>    0.02 │       mov    %rsp,%rbp
>         │       push   %r15
>         │       push   %r14
>         │       push   %r13
>         │       push   %r12
>         │       push   %rbx
>    0.02 │       sub    $0x10,%rsp
>         │     struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
>   71.30 │       mov    0x60(%rdi),%r12
>         │     struct cpumask *cpumask;
>         │     int cid;
>         │
>         │     lockdep_assert_rq_held(rq);
>         │     cpumask = mm_cidmask(mm);
>         │     cid = __this_cpu_read(pcpu_cid->cid);
>   28.44 │       mov    %gs:0x8(%r12),%edx
>         │     if (mm_cid_is_valid(cid)) {
> 
> 
> sched_mm_cid_migrate_to() is at 4.x%, and its annotation:
> 
>         │     dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
>         │       mov     -0x30(%rbp),%rax
>   54.53 │       mov     0x60(%r13),%rbx
>   19.61 │       movslq  0xaf0(%rax),%r15
> 
> Why accessing mm->pcpu_cid is so costly is still a mystery to me...

Then we clearly have another member of mm_struct on the same cache line 
as pcpu_cid which is bouncing all over the place and causing false 
sharing. Any idea which field(s) are causing this?
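
A quick way to confirm the hypothesis is perf c2c (perf c2c record 
followed by perf c2c report), which lists the fields sharing a 
contended cache line. If a neighbouring hot field is confirmed, one 
possible mitigation is a sketch along these lines (assuming no layout 
constraint on mm_struct forbids the extra padding):

	/*
	 * Sketch only: start pcpu_cid on its own cache line; the field
	 * that follows would need the same annotation so that nothing
	 * hot shares the line that mm_cid_get() loads on every call.
	 */
	struct mm_struct {
		/* ... preceding fields ... */
		struct mm_cid __percpu *pcpu_cid ____cacheline_aligned_in_smp;
		/* ... following fields ... */
	};

____cacheline_aligned_in_smp aligns the member to a cache-line 
boundary on SMP builds; whether the padding is acceptable in mm_struct 
is a trade-off a real patch would have to justify.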


> 
> BTW, I used below diff to mitigate the incorrect rq lock issue I
> described in my reply to v8:

Yes, I'll do something similar in my next version, thanks! I'll also 
drop patch 1/2 from my patchset, because clearly I was looking in the 
wrong place.

Thanks,

Mathieu

> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index c6e2dd8f4ee3..f16418731866 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -11662,7 +11662,7 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
>   		return;
>   	}
>   	/* Move src_cid to dst cpu. */
> -	mm_cid_snapshot_time(mm);
> +	mm_cid_snapshot_time(mm, cpu_of(dst_rq));
>   	WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
>   }
>   
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index d1d470441422..8b6a0c8ed3d1 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3348,12 +3348,13 @@ static inline int __mm_cid_try_get(struct mm_struct *mm)
>    * Save a snapshot of the current runqueue time of this cpu
>    * with the per-cpu cid value, allowing to estimate how recently it was used.
>    */
> -static inline void mm_cid_snapshot_time(struct mm_struct *mm)
> +static inline void mm_cid_snapshot_time(struct mm_struct *mm, int cpu)
>   {
> -	struct rq *rq = this_rq();
> +	struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
> +	struct rq *rq = cpu_rq(cpu);
>   
>   	lockdep_assert_rq_held(rq);
> -	__this_cpu_write(mm->pcpu_cid->time, rq->clock);
> +	WRITE_ONCE(pcpu_cid->time, rq->clock);
>   }
>   
>   static inline int __mm_cid_get(struct mm_struct *mm)
> @@ -3404,7 +3405,7 @@ static inline int __mm_cid_get(struct mm_struct *mm)
>   unlock:
>   	raw_spin_unlock(&cid_lock);
>   end:
> -	mm_cid_snapshot_time(mm);
> +	mm_cid_snapshot_time(mm, raw_smp_processor_id());
>   	return cid;
>   }
>   
> @@ -3419,7 +3420,7 @@ static inline int mm_cid_get(struct mm_struct *mm)
>   	cpumask = mm_cidmask(mm);
>   	cid = __this_cpu_read(pcpu_cid->cid);
>   	if (mm_cid_is_valid(cid)) {
> -		mm_cid_snapshot_time(mm);
> +		mm_cid_snapshot_time(mm, raw_smp_processor_id());
>   		return cid;
>   	}
>   	if (mm_cid_is_lazy_put(cid)) {
> @@ -3467,7 +3468,7 @@ static inline void switch_mm_cid(struct task_struct *prev,
>   		 */
>   	}
>   	if (prev->mm_cid_active) {
> -		mm_cid_snapshot_time(prev->mm);
> +		mm_cid_snapshot_time(prev->mm, raw_smp_processor_id());
>   		mm_cid_put_lazy(prev);
>   		prev->mm_cid = -1;
>   	}

-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
