lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220619120451.95251-6-wuyun.abel@bytedance.com>
Date:   Sun, 19 Jun 2022 20:04:49 +0800
From:   Abel Wu <wuyun.abel@...edance.com>
To:     Peter Zijlstra <peterz@...radead.org>,
        Mel Gorman <mgorman@...e.de>,
        Vincent Guittot <vincent.guittot@...aro.org>
Cc:     Josh Don <joshdon@...gle.com>, Chen Yu <yu.c.chen@...el.com>,
        Tim Chen <tim.c.chen@...ux.intel.com>,
        K Prateek Nayak <kprateek.nayak@....com>,
        "Gautham R . Shenoy" <gautham.shenoy@....com>,
        linux-kernel@...r.kernel.org, Abel Wu <wuyun.abel@...edance.com>
Subject: [PATCH v4 5/7] sched/fair: skip SIS domain search if fully busy

If a full scan on SIS domain failed, then no unoccupied cpus available
and the LLC is fully busy. In this case we'd better spend the time on
something more useful, rather than wasting it trying to find an idle
cpu that probably not exist.

The fully busy status will be re-evaluated when any core of this LLC
domain enters load balancing, and cleared once idle cpus found.

Signed-off-by: Abel Wu <wuyun.abel@...edance.com>
---
 include/linux/sched/topology.h | 35 ++++++++++++++-
 kernel/sched/fair.c            | 82 +++++++++++++++++++++++++++++-----
 2 files changed, 104 insertions(+), 13 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 56cffe42abbc..3e99ac98d766 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -77,10 +77,43 @@ extern int sched_domain_level_max;
 
 struct sched_group;
 
+/*
+ * States of the sched-domain
+ *
+ * - sd_has_icores
+ *	This state is only used in LLC domains to indicate worthy
+ *	of a full scan in SIS due to idle cores available.
+ *
+ * - sd_has_icpus
+ *	This state indicates that unoccupied (sched-idle/idle) cpus
+ *	might exist in this domain. For the LLC domains it is the
+ *	default state since these cpus are the main targets of SIS
+ *	search, and is also used as a fallback state of the other
+ *	states.
+ *
+ * - sd_is_busy
+ *	This state indicates there are no unoccupied cpus in this
+ *	domain. So for LLC domains, it gives the hint on whether
+ *	we should put efforts on the SIS search or not.
+ *
+ * For LLC domains, sd_has_icores is set when the last non-idle cpu of
+ * a core becomes idle. After a full SIS scan and if no idle cores found,
+ * sd_has_icores must be cleared and the state will be set to sd_has_icpus
+ * or sd_is_busy depending on whether there is any idle cpu. And during
+ * load balancing on each SMT domain inside the LLC, the state will be
+ * re-evaluated and switch from sd_is_busy to sd_has_icpus if idle cpus
+ * exist.
+ */
+enum sd_state {
+	sd_has_icores,
+	sd_has_icpus,
+	sd_is_busy
+};
+
 struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
-	int		has_idle_cores;
+	int		state;	/* see enum sd_state */
 };
 
 struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1cc86e76e38e..2ca37fdc6c4d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5642,11 +5642,15 @@ static inline void update_overutilized_status(struct rq *rq)
 static inline void update_overutilized_status(struct rq *rq) { }
 #endif
 
+static int unoccupied_rq(struct rq *rq)
+{
+	return rq->nr_running == rq->cfs.idle_h_nr_running;
+}
+
 /* Runqueue only has SCHED_IDLE tasks enqueued */
 static int sched_idle_rq(struct rq *rq)
 {
-	return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
-			rq->nr_running);
+	return unlikely(rq->nr_running && unoccupied_rq(rq));
 }
 
 /*
@@ -6197,24 +6201,44 @@ static inline int __select_idle_cpu(int cpu, struct task_struct *p)
 DEFINE_STATIC_KEY_FALSE(sched_smt_present);
 EXPORT_SYMBOL_GPL(sched_smt_present);
 
-static inline void set_idle_cores(int cpu, int val)
+static inline void sd_set_state(int cpu, enum sd_state state)
 {
 	struct sched_domain_shared *sds;
 
 	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
 	if (sds)
-		WRITE_ONCE(sds->has_idle_cores, val);
+		WRITE_ONCE(sds->state, state);
 }
 
-static inline bool test_idle_cores(int cpu)
+static inline enum sd_state sd_get_state(int cpu)
 {
 	struct sched_domain_shared *sds;
 
 	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
 	if (sds)
-		return READ_ONCE(sds->has_idle_cores);
+		return READ_ONCE(sds->state);
 
-	return false;
+	return sd_has_icpus;
+}
+
+static inline void set_idle_cores(int cpu, int idle)
+{
+	sd_set_state(cpu, idle ? sd_has_icores : sd_has_icpus);
+}
+
+static inline bool test_idle_cores(int cpu)
+{
+	return sd_get_state(cpu) == sd_has_icores;
+}
+
+static inline void set_idle_cpus(int cpu, int idle)
+{
+	sd_set_state(cpu, idle ? sd_has_icpus : sd_is_busy);
+}
+
+static inline bool test_idle_cpus(int cpu)
+{
+	return sd_get_state(cpu) != sd_is_busy;
 }
 
 /*
@@ -6298,7 +6322,7 @@ static int select_idle_smt(struct task_struct *p, int target)
 
 #else /* CONFIG_SCHED_SMT */
 
-static inline void set_idle_cores(int cpu, int val)
+static inline void set_idle_cores(int cpu, int idle)
 {
 }
 
@@ -6307,6 +6331,15 @@ static inline bool test_idle_cores(int cpu)
 	return false;
 }
 
+static inline void set_idle_cpus(int cpu, int idle)
+{
+}
+
+static inline bool test_idle_cpus(int cpu)
+{
+	return true;
+}
+
 static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
 {
 	return __select_idle_cpu(core, p);
@@ -6382,7 +6415,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 		}
 	}
 
-	if (has_idle_core)
+	if (idle_cpu == -1)
+		set_idle_cpus(target, false);
+	else if (has_idle_core)
 		set_idle_cores(target, false);
 
 	if (sched_feat(SIS_PROP) && !has_idle_core) {
@@ -6538,6 +6573,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 			if ((unsigned int)i < nr_cpumask_bits)
 				return i;
 		}
+
+		if (!has_idle_core && !test_idle_cpus(target))
+			return target;
 	}
 
 	i = select_idle_cpu(p, sd, has_idle_core, target);
@@ -8303,6 +8341,8 @@ struct sd_lb_stats {
 	unsigned long avg_load;	/* Average load across all groups in sd */
 	unsigned int prefer_sibling; /* tasks should go to sibling first */
 
+	int sd_state;
+
 	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
 	struct sg_lb_stats local_stat;	/* Statistics of the local group */
 };
@@ -8321,6 +8361,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
 		.local = NULL,
 		.total_load = 0UL,
 		.total_capacity = 0UL,
+		.sd_state = sd_is_busy,
 		.busiest_stat = {
 			.idle_cpus = UINT_MAX,
 			.group_type = group_has_spare,
@@ -8661,6 +8702,12 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds,  struct sg_lb_stats *sgs
 	return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
 }
 
+static inline void sd_classify(struct sd_lb_stats *sds, struct rq *rq)
+{
+	if (sds->sd_state != sd_has_icpus && unoccupied_rq(rq))
+		sds->sd_state = sd_has_icpus;
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
@@ -8675,11 +8722,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 				      struct sg_lb_stats *sgs,
 				      int *sg_status)
 {
-	int i, nr_running, local_group;
+	int i, nr_running, local_group, update_core;
 
 	memset(sgs, 0, sizeof(*sgs));
 
 	local_group = group == sds->local;
+	update_core = env->sd->flags & SD_SHARE_CPUCAPACITY;
 
 	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
@@ -8692,6 +8740,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		nr_running = rq->nr_running;
 		sgs->sum_nr_running += nr_running;
 
+		if (update_core)
+			sd_classify(sds, rq);
+
 		if (nr_running > 1)
 			*sg_status |= SG_OVERLOAD;
 
@@ -9220,6 +9271,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 	return idlest;
 }
 
+static void sd_update_state(struct lb_env *env, struct sd_lb_stats *sds)
+{
+	if (sds->sd_state == sd_has_icpus && !test_idle_cpus(env->dst_cpu))
+		set_idle_cpus(env->dst_cpu, true);
+}
+
 /**
  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @env: The load balancing environment.
@@ -9270,8 +9327,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	/* Tag domain that child domain prefers tasks go to siblings first */
 	sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
 
-
-	if (env->sd->flags & SD_NUMA)
+	if (env->sd->flags & SD_SHARE_CPUCAPACITY)
+		sd_update_state(env, sds);
+	else if (env->sd->flags & SD_NUMA)
 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
 
 	if (!env->sd->parent) {
-- 
2.31.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ