linux-kernel - [RFC PATCH 2/3] sched/fair: Improve integration of SHARED_RUNQ feature within newidle

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20230831104508.7619-3-kprateek.nayak@amd.com>
Date:   Thu, 31 Aug 2023 16:15:07 +0530
From:   K Prateek Nayak <kprateek.nayak@....com>
To:     <void@...ifault.com>
CC:     <linux-kernel@...r.kernel.org>, <peterz@...radead.org>,
        <mingo@...hat.com>, <juri.lelli@...hat.com>,
        <vincent.guittot@...aro.org>, <dietmar.eggemann@....com>,
        <rostedt@...dmis.org>, <bsegall@...gle.com>, <mgorman@...e.de>,
        <bristot@...hat.com>, <vschneid@...hat.com>, <tj@...nel.org>,
        <roman.gushchin@...ux.dev>, <gautham.shenoy@....com>,
        <aaron.lu@...el.com>, <wuyun.abel@...edance.com>,
        <kernel-team@...a.com>, <kprateek.nayak@....com>
Subject: [RFC PATCH 2/3] sched/fair: Improve integration of SHARED_RUNQ feature within newidle_balance

This patch takes the relevant optimizations from [1] in
newidle_balance(). Following is the breakdown:

- Check "rq->rd->overload" before jumping into newidle_balance, even
  with SHARED_RQ feat enabled.

- Call update_next_balance() for all the domains till MC Domain in
  when SHARED_RQ path is taken.

- Account cost from shared_runq_pick_next_task() and update
  curr_cost and sd->max_newidle_lb_cost accordingly.

- Move the initial rq_unpin_lock() logic around. Also, the caller of
  shared_runq_pick_next_task() is responsible for calling
  rq_repin_lock() if the return value is non zero. (Needs to be verified
  everything is right with LOCKDEP)

- Includes a fix to skip directly above the LLC domain when calling the
  load_balance() in newidle_balance()

All other surgery from [1] has been removed.

Link: https://lore.kernel.org/all/31aeb639-1d66-2d12-1673-c19fed0ab33a@amd.com/ [1]
Signed-off-by: K Prateek Nayak <kprateek.nayak@....com>
---
 kernel/sched/fair.c | 94 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 67 insertions(+), 27 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bf844ffa79c2..446ffdad49e1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -337,7 +337,6 @@ static int shared_runq_pick_next_task(struct rq *rq, struct rq_flags *rf)
 		rq_unpin_lock(rq, &src_rf);
 		raw_spin_unlock_irqrestore(&p->pi_lock, src_rf.flags);
 	}
-	rq_repin_lock(rq, rf);
 
 	return ret;
 }
@@ -12276,50 +12275,83 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 	if (!cpu_active(this_cpu))
 		return 0;
 
-	if (sched_feat(SHARED_RUNQ)) {
-		pulled_task = shared_runq_pick_next_task(this_rq, rf);
-		if (pulled_task)
-			return pulled_task;
-	}
-
 	/*
 	 * We must set idle_stamp _before_ calling idle_balance(), such that we
 	 * measure the duration of idle_balance() as idle time.
 	 */
 	this_rq->idle_stamp = rq_clock(this_rq);
 
-	/*
-	 * This is OK, because current is on_cpu, which avoids it being picked
-	 * for load-balance and preemption/IRQs are still disabled avoiding
-	 * further scheduler activity on it and we're being very careful to
-	 * re-start the picking loop.
-	 */
-	rq_unpin_lock(this_rq, rf);
-
 	rcu_read_lock();
-	sd = rcu_dereference_check_sched_domain(this_rq->sd);
-
-	/*
-	 * Skip <= LLC domains as they likely won't have any tasks if the
-	 * shared runq is empty.
-	 */
-	if (sched_feat(SHARED_RUNQ)) {
+	if (sched_feat(SHARED_RUNQ))
 		sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
-		if (likely(sd))
-			sd = sd->parent;
-	}
+	else
+		sd = rcu_dereference_check_sched_domain(this_rq->sd);
 
 	if (!READ_ONCE(this_rq->rd->overload) ||
-	    (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
+	    /* Look at rq->avg_idle iff SHARED_RUNQ is disabled */
+	    (!sched_feat(SHARED_RUNQ) && sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
 
-		if (sd)
+		while (sd) {
 			update_next_balance(sd, &next_balance);
+			sd = sd->child;
+		}
+
 		rcu_read_unlock();
 
 		goto out;
 	}
+
+	if (sched_feat(SHARED_RUNQ)) {
+		struct sched_domain *tmp = sd;
+
+		t0 = sched_clock_cpu(this_cpu);
+
+		/* Do update_next_balance() for all domains within LLC */
+		while (tmp) {
+			update_next_balance(tmp, &next_balance);
+			tmp = tmp->child;
+		}
+
+		pulled_task = shared_runq_pick_next_task(this_rq, rf);
+		if (pulled_task) {
+			if (sd) {
+				curr_cost = sched_clock_cpu(this_cpu) - t0;
+				/*
+				 * Will help bail out of scans of higer domains
+				 * slightly earlier.
+				 */
+				update_newidle_cost(sd, curr_cost);
+			}
+
+			rcu_read_unlock();
+			goto out_swq;
+		}
+
+		if (sd) {
+			t1 = sched_clock_cpu(this_cpu);
+			curr_cost += t1 - t0;
+			update_newidle_cost(sd, curr_cost);
+		}
+
+		/*
+		 * Since shared_runq_pick_next_task() can take a while
+		 * check if the CPU was targetted for a wakeup in the
+		 * meantime.
+		 */
+		if (this_rq->ttwu_pending) {
+			rcu_read_unlock();
+			return 0;
+		}
+	}
 	rcu_read_unlock();
 
+	/*
+	 * This is OK, because current is on_cpu, which avoids it being picked
+	 * for load-balance and preemption/IRQs are still disabled avoiding
+	 * further scheduler activity on it and we're being very careful to
+	 * re-start the picking loop.
+	 */
+	rq_unpin_lock(this_rq, rf);
 	raw_spin_rq_unlock(this_rq);
 
 	t0 = sched_clock_cpu(this_cpu);
@@ -12335,6 +12367,13 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
 			break;
 
+		/*
+		 * Skip <= LLC domains as they likely won't have any tasks if the
+		 * shared runq is empty.
+		 */
+		if (sched_feat(SHARED_RUNQ) && (sd->flags & SD_SHARE_PKG_RESOURCES))
+			continue;
+
 		if (sd->flags & SD_BALANCE_NEWIDLE) {
 
 			pulled_task = load_balance(this_cpu, this_rq,
@@ -12361,6 +12400,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 
 	raw_spin_rq_lock(this_rq);
 
+out_swq:
 	if (curr_cost > this_rq->max_idle_balance_cost)
 		this_rq->max_idle_balance_cost = curr_cost;
 
-- 
2.34.1