Message-Id: <20220217154403.6497-3-wuyun.abel@bytedance.com>
Date: Thu, 17 Feb 2022 23:43:58 +0800
From: Abel Wu <wuyun.abel@...edance.com>
To: Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>,
Daniel Bristot de Oliveira <bristot@...hat.com>
Cc: linux-kernel@...r.kernel.org
Subject: [RFC PATCH 2/5] sched/fair: introduce sched-idle balance

The goal of sched-idle balancing is to let non-idle tasks make
full use of cpu resources. To achieve that, we mainly do two
things:

 - pull non-idle tasks from overloaded rqs to sched-idle or
   idle rqs, and
 - prevent pulling the last non-idle task off an rq.

Sched-idle balancing is performed during normal load balancing
and, when necessary, during newly-idle balancing. Idle balancing
is skipped due to its high wakeup latency.
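
In rough pseudo-code, the idea looks like the sketch below. This is
illustrative only; llc_cpus() and pull_one_non_idle_task() are
placeholders for the LLC shared-mask iteration and the detach/attach
pair in the actual code:

  /* Illustrative sketch only, not the actual implementation. */
  static void sched_idle_balance_sketch(struct rq *dst_rq)
  {
          int cpu;

          /* dst_rq still runs a non-idle task, nothing to do */
          if (dst_rq->cfs.h_nr_running != dst_rq->cfs.idle_h_nr_running)
                  return;

          for_each_cpu(cpu, llc_cpus(dst_rq)) {
                  struct rq *rq = cpu_rq(cpu);

                  /* overloaded: more than one non-idle task queued */
                  if (rq->cfs.h_nr_running - rq->cfs.idle_h_nr_running <= 1)
                          continue;

                  /* migrating a single non-idle task is enough */
                  if (pull_one_non_idle_task(rq, dst_rq))
                          break;
          }
  }
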
Signed-off-by: Abel Wu <wuyun.abel@...edance.com>
---
include/linux/sched/idle.h | 1 +
kernel/sched/fair.c | 128 +++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 129 insertions(+)
diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h
index d73d314d59c6..50ec5c770f85 100644
--- a/include/linux/sched/idle.h
+++ b/include/linux/sched/idle.h
@@ -8,6 +8,7 @@ enum cpu_idle_type {
CPU_IDLE,
CPU_NOT_IDLE,
CPU_NEWLY_IDLE,
+ CPU_SCHED_IDLE,
CPU_MAX_IDLE_TYPES
};
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0a0438c3319b..070a6fb1d2bf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -456,6 +456,21 @@ static int se_is_idle(struct sched_entity *se)
return cfs_rq_is_idle(group_cfs_rq(se));
}
+/* Is the task idle when viewed from the top of its hierarchy? */
+static int task_h_idle(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+
+ if (task_has_idle_policy(p))
+ return 1;
+
+ for_each_sched_entity(se)
+ if (cfs_rq_is_idle(cfs_rq_of(se)))
+ return 1;
+
+ return 0;
+}
+
#else /* !CONFIG_FAIR_GROUP_SCHED */
#define for_each_sched_entity(se) \
@@ -508,6 +523,11 @@ static int se_is_idle(struct sched_entity *se)
return 0;
}
+static inline int task_h_idle(struct task_struct *p)
+{
+ return task_has_idle_policy(p);
+}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */
static __always_inline
@@ -6974,6 +6994,11 @@ static inline int cfs_rq_overloaded(struct rq *rq)
return rq->cfs.h_nr_running - rq->cfs.idle_h_nr_running > 1;
}
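+/* True if all queued cfs tasks (if any) are (sched-)idle. */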
+static inline bool need_pull_cfs_task(struct rq *rq)
+{
+ return rq->cfs.h_nr_running == rq->cfs.idle_h_nr_running;
+}
+
/* Must be called with rq locked */
static void update_overload_status(struct rq *rq)
{
@@ -7767,6 +7792,22 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (kthread_is_per_cpu(p))
return 0;
+ /*
+ * Disregard hierarchically idle tasks during sched-idle
+ * load balancing.
+ */
+ if (env->idle == CPU_SCHED_IDLE && task_h_idle(p))
+ return 0;
+
+ /*
+ * Skip p if it is the last non-idle task in src_rq, to
+ * protect latency and throughput for non-idle tasks at
+ * the cost of a temporary load imbalance (which will
+ * likely be fixed by subsequent balancing).
+ */
+ if (!cfs_rq_overloaded(env->src_rq) && !task_h_idle(p))
+ return 0;
+
if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
int cpu;
@@ -10265,6 +10306,83 @@ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
}
/*
+ * Sched-idle balancing tries to relieve overloaded cfs rqs by
+ * spreading their non-idle tasks out prior to normal load balancing.
+ */
+static void sched_idle_balance(struct rq *dst_rq)
+{
+ struct sched_domain *sd;
+ struct task_struct *p;
+ int dst_cpu = cpu_of(dst_rq), cpu;
+
+ sd = rcu_dereference(per_cpu(sd_llc, dst_cpu));
+ if (unlikely(!sd))
+ return;
+
+ if (!atomic_read(&sd->shared->nr_overloaded))
+ return;
+
+ for_each_cpu_wrap(cpu, sdo_mask(sd->shared), dst_cpu + 1) {
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+ struct lb_env env;
+
+ if (cpu == dst_cpu)
+ continue;
+
+ if (!cfs_rq_overloaded(rq))
+ continue;
+
+ rq_lock_irqsave(rq, &rf);
+
+ /*
+ * Check again to ensure there are still pullable tasks,
+ * because multiple rqs can pull tasks at the same time.
+ * If the recheck fails, contention on this rq is heavy,
+ * so it is better to clear this cpu from the overloaded
+ * mask right away.
+ */
+ if (unlikely(!cfs_rq_overloaded(rq))) {
+ update_overload_status(rq);
+ rq_unlock_irqrestore(rq, &rf);
+ continue;
+ }
+
+ env = (struct lb_env) {
+ .sd = sd,
+ .dst_cpu = dst_cpu,
+ .dst_rq = dst_rq,
+ .src_cpu = cpu,
+ .src_rq = rq,
+ .idle = CPU_SCHED_IDLE, /* non-idle only */
+ .flags = LBF_DST_PINNED, /* pin dst_cpu */
+ };
+
+ update_rq_clock(rq);
+ p = detach_one_task(&env);
+
+ /*
+ * Update the overloaded mask lazily here. If the rq is
+ * still overloaded, an update would just waste cycles.
+ * It is also fine if the rq has become un-overloaded,
+ * since peeking at the rq's data without the lock is
+ * cheap in the following iterations (during which the
+ * rq may well become overloaded again).
+ */
+
+ rq_unlock(rq, &rf);
+
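+ /* One pulled non-idle task is enough for dst_rq, so stop here. */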
+ if (p) {
+ attach_one_task(dst_rq, p);
+ local_irq_restore(rf.flags);
+ return;
+ }
+
+ local_irq_restore(rf.flags);
+ }
+}
+
+/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
*
@@ -10284,6 +10402,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
u64 max_cost = 0;
rcu_read_lock();
+
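+ /* Pull a non-idle task first if only (sched-)idle tasks are queued. */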
+ if (need_pull_cfs_task(rq))
+ sched_idle_balance(rq);
+
for_each_domain(cpu, sd) {
/*
* Decay the newidle max times here because this is a regular
@@ -10913,6 +11035,12 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
update_blocked_averages(this_cpu);
rcu_read_lock();
+
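+ /* Try to pull a non-idle task; charge the time spent to the newidle cost. */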
+ sched_idle_balance(this_rq);
+ t1 = sched_clock_cpu(this_cpu);
+ curr_cost += t1 - t0;
+ t0 = t1;
+
for_each_domain(this_cpu, sd) {
int continue_balancing = 1;
u64 domain_cost;
--
2.11.0