linux-kernel - Re: [PATCH v3] sched: async unthrottling for cfs bandwidth

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives

Hash Suite: Windows password security audit tool. GUI, reports in PDF.

[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]

Message-ID: <Y4CDZXZJpPB0J1BV@hirez.programming.kicks-ass.net>
Date:   Fri, 25 Nov 2022 09:57:09 +0100
From:   Peter Zijlstra <peterz@...radead.org>
To:     Josh Don <joshdon@...gle.com>
Cc:     Chengming Zhou <zhouchengming@...edance.com>,
        Ingo Molnar <mingo@...hat.com>,
        Juri Lelli <juri.lelli@...hat.com>,
        Vincent Guittot <vincent.guittot@...aro.org>,
        Dietmar Eggemann <dietmar.eggemann@....com>,
        Steven Rostedt <rostedt@...dmis.org>,
        Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>,
        Daniel Bristot de Oliveira <bristot@...hat.com>,
        Valentin Schneider <vschneid@...hat.com>,
        linux-kernel@...r.kernel.org, Tejun Heo <tj@...nel.org>,
        Michal Koutný <mkoutny@...e.com>,
        Christian Brauner <brauner@...nel.org>,
        Zefan Li <lizefan.x@...edance.com>,
        Thomas Gleixner <tglx@...utronix.de>,
        Frederic Weisbecker <fweisbec@...il.com>,
        anna-maria@...utronix.de
Subject: Re: [PATCH v3] sched: async unthrottling for cfs bandwidth

On Tue, Nov 22, 2022 at 11:35:48AM +0100, Peter Zijlstra wrote:
> On Mon, Nov 21, 2022 at 11:37:14AM -0800, Josh Don wrote:
> > Yep, this tradeoff feels "best", but there are some edge cases where
> > this could potentially disrupt fairness. For example, if we have
> > non-trivial W, a lot of cpus to iterate through for dispatching remote
> > unthrottle, and quota is small. Doesn't help that the timer is pinned
> > so that this will continually hit the same cpu.
> 
> We could -- if we wanted to -- manually rotate the timer around the
> relevant CPUs. Doing that sanely would require a bit of hrtimer surgery
> though I'm afraid.

Here; something like so should enable us to cycle the bandwidth timer.
Just need to figure out a way to find another CPU or something.

---
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 0ee140176f10..f8bd200d678a 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -63,8 +63,10 @@ enum hrtimer_mode {
  * Return values for the callback function
  */
 enum hrtimer_restart {
-	HRTIMER_NORESTART,	/* Timer is not restarted */
-	HRTIMER_RESTART,	/* Timer must be restarted */
+	HRTIMER_RESTART = -1,		/* Timer must be restarted */
+	HRTIMER_NORESTART = 0,		/* Timer is not restarted */
+	HRTIMER_RESTART_MIGRATE = 1,
+	HRTIMER_RESTART_MIGRATE_MAX = HRTIMER_RESTART_MIGRATE + NR_CPUS,
 };
 
 /*
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 3ae661ab6260..e75033f78a19 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1621,6 +1621,16 @@ bool hrtimer_active(const struct hrtimer *timer)
 }
 EXPORT_SYMBOL_GPL(hrtimer_active);
 
+static void raw_spin_lock_double(raw_spinlock_t *a, raw_spinlock_t *b)
+{
+	if (b < a)
+		swap(a, b);
+
+	raw_spin_lock(a);
+	if (b != a)
+		raw_spin_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
+
 /*
  * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
  * distinct sections:
@@ -1644,6 +1654,8 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 			  struct hrtimer *timer, ktime_t *now,
 			  unsigned long flags) __must_hold(&cpu_base->lock)
 {
+	struct hrtimer_cpu_base *new_cpu_base = cpu_base;
+	struct hrtimer_clock_base *new_base = base;
 	enum hrtimer_restart (*fn)(struct hrtimer *);
 	bool expires_in_hardirq;
 	int restart;
@@ -1686,7 +1698,17 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 
 	lockdep_hrtimer_exit(expires_in_hardirq);
 	trace_hrtimer_expire_exit(timer);
-	raw_spin_lock_irq(&cpu_base->lock);
+
+	local_irq_disable();
+
+	if (restart >= HRTIMER_RESTART_MIGRATE) {
+		int cpu = restart - HRTIMER_RESTART_MIGRATE;
+		int b = base - cpu_base->clock_base;
+
+		new_cpu_base = &per_cpu(hrtimer_bases, cpu);
+		new_base = new_cpu_base->clock_base[b];
+	}
+	raw_spin_lock_double(&cpu_base->lock, &new_cpu_base->lock);
 
 	/*
 	 * Note: We clear the running state after enqueue_hrtimer and
@@ -1698,8 +1720,16 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 	 * for us already.
 	 */
 	if (restart != HRTIMER_NORESTART &&
-	    !(timer->state & HRTIMER_STATE_ENQUEUED))
-		enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
+	    !(timer->state & HRTIMER_STATE_ENQUEUED)) {
+
+		if (new_cpu_base != cpu_base) {
+			timer->base = new_base;
+			enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
+			raw_spin_unlock(&new_cpu_base->lock);
+		} else {
+			enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
+		}
+	}
 
 	/*
 	 * Separate the ->running assignment from the ->state assignment.
@@ -2231,12 +2261,8 @@ int hrtimers_dead_cpu(unsigned int scpu)
 	local_irq_disable();
 	old_base = &per_cpu(hrtimer_bases, scpu);
 	new_base = this_cpu_ptr(&hrtimer_bases);
-	/*
-	 * The caller is globally serialized and nobody else
-	 * takes two locks at once, deadlock is not possible.
-	 */
-	raw_spin_lock(&new_base->lock);
-	raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+
+	raw_spin_lock_double(&old_base->lock, &new_base->lock);
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
 		migrate_hrtimer_list(&old_base->clock_base[i],