Date:   Thu, 20 Apr 2023 17:56:10 +0800
From:   Aaron Lu <aaron.lu@...el.com>
To:     Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
CC:     Peter Zijlstra <peterz@...radead.org>,
        <linux-kernel@...r.kernel.org>, Olivier Dion <odion@...icios.com>,
        <michael.christie@...cle.com>
Subject: Re: [RFC PATCH v9 2/2] sched: Fix performance regression introduced
 by mm_cid

On Wed, Apr 19, 2023 at 11:50:12AM -0400, Mathieu Desnoyers wrote:
> Introduce per-mm/cpu current concurrency id (mm_cid) to fix a PostgreSQL
> sysbench regression reported by Aaron Lu.

mm_cid_get() dropped to 5.x% after I disabled CONFIG_DEBUG_PREEMPT;
using __this_cpu_X() doesn't help, and I suppose that is because
__this_cpu_X() still needs to fetch mm->pcpu_cid.

Annotation of mm_cid_get():

       │     static inline int mm_cid_get(struct mm_struct *mm)
       │     {
  0.05 │       push   %rbp
  0.02 │       mov    %rsp,%rbp
       │       push   %r15
       │       push   %r14
       │       push   %r13
       │       push   %r12
       │       push   %rbx
  0.02 │       sub    $0x10,%rsp
       │     struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
 71.30 │       mov    0x60(%rdi),%r12
       │     struct cpumask *cpumask;
       │     int cid;
       │
       │     lockdep_assert_rq_held(rq);
       │     cpumask = mm_cidmask(mm);
       │     cid = __this_cpu_read(pcpu_cid->cid);
 28.44 │       mov    %gs:0x8(%r12),%edx
       │     if (mm_cid_is_valid(cid)) {

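My reading of the two hot instructions (a sketch of my understanding,
not part of the patch; the offsets are taken from the annotation
above):

	/*
	 * mov    0x60(%rdi),%r12    (71.30%): load the percpu base
	 *                           pointer mm->pcpu_cid out of the
	 *                           mm_struct.
	 * mov    %gs:0x8(%r12),%edx (28.44%): the %gs-relative percpu
	 *                           read of pcpu_cid->cid.
	 */
	struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;	/* load 1 */
	int cid = __this_cpu_read(pcpu_cid->cid);		/* load 2 */

With CONFIG_DEBUG_PREEMPT disabled, this_cpu_read() and
__this_cpu_read() both compile down to the same single %gs-relative
mov, so neither variant can avoid load 1.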

sched_mm_cid_migrate_to() is at 4.x% and its annotation:

       │     dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
       │       mov     -0x30(%rbp),%rax
 54.53 │       mov     0x60(%r13),%rbx
 19.61 │       movslq  0xaf0(%rax),%r15

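My reading of those loads (again a sketch; the field offsets are
inferred from the disassembly, not verified):

	/*
	 * mov    0x60(%r13),%rbx   (54.53%): load mm->pcpu_cid
	 * movslq 0xaf0(%rax),%r15  (19.61%): load dst_rq->cpu, i.e.
	 *                          cpu_of(dst_rq)
	 *
	 * per_cpu_ptr() then only adds __per_cpu_offset[dst_rq->cpu]
	 * to the base pointer, so the two hot instructions are the
	 * dependent loads feeding that address computation.
	 */
	dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));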
The reason why accessing mm->pcpu_cid is so costly is still a mystery
to me...

BTW, I used the below diff to mitigate the incorrect rq lock issue I
described in my reply to v8 (in sched_mm_cid_migrate_to(), the rq lock
held is dst_rq's, which may not be this CPU's):

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c6e2dd8f4ee3..f16418731866 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11662,7 +11662,7 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
 		return;
 	}
 	/* Move src_cid to dst cpu. */
-	mm_cid_snapshot_time(mm);
+	mm_cid_snapshot_time(mm, cpu_of(dst_rq));
 	WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d1d470441422..8b6a0c8ed3d1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3348,12 +3348,13 @@ static inline int __mm_cid_try_get(struct mm_struct *mm)
  * Save a snapshot of the current runqueue time of this cpu
  * with the per-cpu cid value, allowing to estimate how recently it was used.
  */
-static inline void mm_cid_snapshot_time(struct mm_struct *mm)
+static inline void mm_cid_snapshot_time(struct mm_struct *mm, int cpu)
 {
-	struct rq *rq = this_rq();
+	struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
+	struct rq *rq = cpu_rq(cpu);
 
 	lockdep_assert_rq_held(rq);
-	__this_cpu_write(mm->pcpu_cid->time, rq->clock);
+	WRITE_ONCE(pcpu_cid->time, rq->clock);
 }
 
 static inline int __mm_cid_get(struct mm_struct *mm)
@@ -3404,7 +3405,7 @@ static inline int __mm_cid_get(struct mm_struct *mm)
 unlock:
 	raw_spin_unlock(&cid_lock);
 end:
-	mm_cid_snapshot_time(mm);
+	mm_cid_snapshot_time(mm, raw_smp_processor_id());
 	return cid;
 }
 
@@ -3419,7 +3420,7 @@ static inline int mm_cid_get(struct mm_struct *mm)
 	cpumask = mm_cidmask(mm);
 	cid = __this_cpu_read(pcpu_cid->cid);
 	if (mm_cid_is_valid(cid)) {
-		mm_cid_snapshot_time(mm);
+		mm_cid_snapshot_time(mm, raw_smp_processor_id());
 		return cid;
 	}
 	if (mm_cid_is_lazy_put(cid)) {
@@ -3467,7 +3468,7 @@ static inline void switch_mm_cid(struct task_struct *prev,
 		 */
 	}
 	if (prev->mm_cid_active) {
-		mm_cid_snapshot_time(prev->mm);
+		mm_cid_snapshot_time(prev->mm, raw_smp_processor_id());
 		mm_cid_put_lazy(prev);
 		prev->mm_cid = -1;
 	}

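To summarize the semantic change in mm_cid_snapshot_time() (my
summary, not part of the diff):

	/* Before: always write the slot of the CPU we are running on,
	 * which is wrong when the rq lock we hold is a remote CPU's. */
	__this_cpu_write(mm->pcpu_cid->time, rq->clock);

	/* After: write the slot of the explicitly named cpu;
	 * per_cpu_ptr() resolves that cpu's instance and WRITE_ONCE()
	 * avoids store tearing for any lockless readers of
	 * pcpu_cid->time. */
	WRITE_ONCE(per_cpu_ptr(mm->pcpu_cid, cpu)->time, rq->clock);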