linux-kernel - [PATCH v4 7/7] sched/fair: de-entropy for SIS filter

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220619120451.95251-8-wuyun.abel@bytedance.com>
Date:   Sun, 19 Jun 2022 20:04:51 +0800
From:   Abel Wu <wuyun.abel@...edance.com>
To:     Peter Zijlstra <peterz@...radead.org>,
        Mel Gorman <mgorman@...e.de>,
        Vincent Guittot <vincent.guittot@...aro.org>
Cc:     Josh Don <joshdon@...gle.com>, Chen Yu <yu.c.chen@...el.com>,
        Tim Chen <tim.c.chen@...ux.intel.com>,
        K Prateek Nayak <kprateek.nayak@....com>,
        "Gautham R . Shenoy" <gautham.shenoy@....com>,
        linux-kernel@...r.kernel.org, Abel Wu <wuyun.abel@...edance.com>
Subject: [PATCH v4 7/7] sched/fair: de-entropy for SIS filter

Now when updating core state, there are two main problems that could
pollute the SIS filter:

  - The update happens before task migration, so if dst_cpu,
    which might be fed with tasks soon, is selected to be
    propagated, the effort we paid achieves no more than
    setting a busy cpu into the SIS filter. On the other hand,
    it is important that we update as early as possible to
    keep the filter fresh, so it's not wise to delay the
    update to the end of load balancing.

  - False negative propagation hurts performance since some
    idle cpus could be out of reach. So in general we will
    aggressively propagate idle cpus but allow false positives
    to continue to exist for a while, which may lead to the
    filter being fully polluted.

The pain can be relieved by a forced correction when false
positives are continuously detected.

Signed-off-by: Abel Wu <wuyun.abel@...edance.com>
---
 include/linux/sched/topology.h |  7 +++++
 kernel/sched/fair.c            | 51 ++++++++++++++++++++++++++++++++--
 2 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index b93edf587d84..e3552ce192a9 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -91,6 +91,12 @@ struct sched_group;
  *	search, and is also used as a fallback state of the other
  *	states.
  *
+ * - sd_may_idle
+ *	This state implies the instability of the SIS filter, and
+ *	some bits of it may be out of date. This state is only used in
+ *	SMT domains as an intermediate state between sd_has_icpus
+ *	and sd_is_busy.
+ *
  * - sd_is_busy
  *	This state indicates there are no unoccupied cpus in this
  *	domain. So for LLC domains, it gives the hint on whether
@@ -111,6 +117,7 @@ struct sched_group;
 enum sd_state {
 	sd_has_icores,
 	sd_has_icpus,
+	sd_may_idle,
 	sd_is_busy
 };
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d55fdcedf2c0..9713d183d35e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8768,6 +8768,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
+		bool update = update_core && (env->dst_cpu != i);
 
 		sgs->group_load += cpu_load(rq);
 		sgs->group_util += cpu_util_cfs(i);
@@ -8777,7 +8778,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		nr_running = rq->nr_running;
 		sgs->sum_nr_running += nr_running;
 
-		if (update_core)
+		/*
+		 * The dst_cpu is not preferred since it might
+		 * be fed with tasks soon.
+		 */
+		if (update)
 			sd_classify(sds, rq, i);
 
 		if (nr_running > 1)
@@ -8801,7 +8806,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 			 * and fed with tasks, so we'd better choose
 			 * a candidate in an opposite way.
 			 */
-			sds->idle_cpu = i;
+			if (update)
+				sds->idle_cpu = i;
 			sgs->idle_cpus++;
 
 			/* Idle cpu can't have misfit task */
@@ -9321,7 +9327,7 @@ static void sd_update_state(struct lb_env *env, struct sd_lb_stats *sds)
 {
 	struct sched_domain_shared *sd_smt_shared = env->sd->shared;
 	enum sd_state new = sds->sd_state;
-	int this = env->dst_cpu;
+	int icpu = sds->idle_cpu, this = env->dst_cpu;
 
 	/*
 	 * Parallel updating can hardly contribute accuracy to
@@ -9331,6 +9337,22 @@ static void sd_update_state(struct lb_env *env, struct sd_lb_stats *sds)
 	if (cmpxchg(&sd_smt_shared->updating, 0, 1))
 		return;
 
+	/*
+	 * The dst_cpu is likely to be fed with tasks soon.
+	 * If it is the only unoccupied cpu in this domain,
+	 * we still handle it the same way as sd_has_icpus
+	 * but turn the SMT into the unstable state, rather
+	 * than waiting until the end of load balancing,
+	 * since it's also important to update the filter
+	 * as early as possible to keep it fresh.
+	 */
+	if (new == sd_is_busy) {
+		if (idle_cpu(this) || sched_idle_cpu(this)) {
+			new = sd_may_idle;
+			icpu = this;
+		}
+	}
+
 	/*
 	 * There is at least one unoccupied cpu available, so
 	 * propagate it to the filter to avoid false negative
@@ -9338,6 +9360,12 @@ static void sd_update_state(struct lb_env *env, struct sd_lb_stats *sds)
 	 * idle cpus thus throughupt downgraded.
 	 */
 	if (new != sd_is_busy) {
+		/*
+		 * The sd_may_idle state is taken into
+		 * consideration as well because from
+		 * here we can't actually know whether
+		 * task migrations will happen or not.
+		 */
 		if (!test_idle_cpus(this))
 			set_idle_cpus(this, true);
 	} else {
@@ -9347,9 +9375,26 @@ static void sd_update_state(struct lb_env *env, struct sd_lb_stats *sds)
 		 */
 		if (sd_smt_shared->state == sd_is_busy)
 			goto out;
+
+		/*
+		 * Allow false positives to exist for some
+		 * time, trading accuracy of the filter for
+		 * reduced cache traffic.
+		 */
+		if (sd_smt_shared->state == sd_has_icpus) {
+			new = sd_may_idle;
+			goto update;
+		}
+
+		/*
+		 * If the false positive issue has already been
+		 * there for a while, a correction of the filter
+		 * is needed.
+		 */
 	}
 
 	sd_update_icpus(this, sds->idle_cpu);
+update:
 	sd_smt_shared->state = new;
 out:
 	xchg(&sd_smt_shared->updating, 0);
-- 
2.31.1