linux-kernel - Re: [PATCH v4 7/7] sched/fair: de-entropy for SIS filter

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <9e6b8d11-aa7a-fe6e-acc4-c226874d91f0@bytedance.com>
Date:   Thu, 30 Jun 2022 15:46:25 +0800
From:   Abel Wu <wuyun.abel@...edance.com>
To:     Peter Zijlstra <peterz@...radead.org>,
        Mel Gorman <mgorman@...e.de>,
        Vincent Guittot <vincent.guittot@...aro.org>
Cc:     Josh Don <joshdon@...gle.com>, Chen Yu <yu.c.chen@...el.com>,
        Tim Chen <tim.c.chen@...ux.intel.com>,
        K Prateek Nayak <kprateek.nayak@....com>,
        "Gautham R . Shenoy" <gautham.shenoy@....com>,
        linux-kernel@...r.kernel.org
Subject: Re: [PATCH v4 7/7] sched/fair: de-entropy for SIS filter


On 6/19/22 8:04 PM, Abel Wu Wrote:
> Now when updating core state, there are two main problems that could
> pollute the SIS filter:
> 
>    - The updating is before task migration, so if dst_cpu is
>      selected to be propagated which might be fed with tasks
>      soon, the efforts we paid is no more than setting a busy
>      cpu into the SIS filter. While on the other hand it is
>      important that we update as early as possible to keep the
>      filter fresh, so it's not wise to delay the update to the
>      end of load balancing.
> 
>    - False negative propagation hurts performance since some
>      idle cpus could be out of reach. So in general we will
>      aggressively propagate idle cpus but allow false positive
>      continue to exist for a while, which may lead to filter
>      being fully polluted.
> 
> Pains can be relieved by a force correction when false positive
> continuously detected.
> 
> Signed-off-by: Abel Wu <wuyun.abel@...edance.com>
> ---
>   include/linux/sched/topology.h |  7 +++++
>   kernel/sched/fair.c            | 51 ++++++++++++++++++++++++++++++++--
>   2 files changed, 55 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index b93edf587d84..e3552ce192a9 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -91,6 +91,12 @@ struct sched_group;
>    *	search, and is also used as a fallback state of the other
>    *	states.
>    *
> + * - sd_may_idle
> + *	This state implies the unstableness of the SIS filter, and
> + *	some bits of it may out of date. This state is only used in
> + *	SMT domains as an intermediate state between sd_has_icpus
> + *	and sd_is_busy.
> + *
>    * - sd_is_busy
>    *	This state indicates there are no unoccupied cpus in this
>    *	domain. So for LLC domains, it gives the hint on whether
> @@ -111,6 +117,7 @@ struct sched_group;
>   enum sd_state {
>   	sd_has_icores,
>   	sd_has_icpus,
> +	sd_may_idle,
>   	sd_is_busy
>   };
>   
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index d55fdcedf2c0..9713d183d35e 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8768,6 +8768,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>   
>   	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
>   		struct rq *rq = cpu_rq(i);
> +		bool update = update_core && (env->dst_cpu != i);
>   
>   		sgs->group_load += cpu_load(rq);
>   		sgs->group_util += cpu_util_cfs(i);
> @@ -8777,7 +8778,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>   		nr_running = rq->nr_running;
>   		sgs->sum_nr_running += nr_running;
>   
> -		if (update_core)
> +		/*
> +		 * The dst_cpu is not preferred since it might
> +		 * be fed with tasks soon.
> +		 */
> +		if (update)
>   			sd_classify(sds, rq, i);
>   
>   		if (nr_running > 1)
> @@ -8801,7 +8806,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>   			 * and fed with tasks, so we'd better choose
>   			 * a candidate in an opposite way.
>   			 */
> -			sds->idle_cpu = i;
> +			if (update)
> +				sds->idle_cpu = i;
>   			sgs->idle_cpus++;
>   
>   			/* Idle cpu can't have misfit task */
> @@ -9321,7 +9327,7 @@ static void sd_update_state(struct lb_env *env, struct sd_lb_stats *sds)
>   {
>   	struct sched_domain_shared *sd_smt_shared = env->sd->shared;
>   	enum sd_state new = sds->sd_state;
> -	int this = env->dst_cpu;
> +	int icpu = sds->idle_cpu, this = env->dst_cpu;
>   
>   	/*
>   	 * Parallel updating can hardly contribute accuracy to
> @@ -9331,6 +9337,22 @@ static void sd_update_state(struct lb_env *env, struct sd_lb_stats *sds)
>   	if (cmpxchg(&sd_smt_shared->updating, 0, 1))
>   		return;
>   
> +	/*
> +	 * The dst_cpu is likely to be fed with tasks soon.
> +	 * If it is the only unoccupied cpu in this domain,
> +	 * we still handle it the same way as as_has_icpus
> +	 * but turn the SMT into the unstable state, rather
> +	 * than waiting to the end of load balancing since
> +	 * it's also important that update the filter as
> +	 * early as possible to keep it fresh.
> +	 */
> +	if (new == sd_is_busy) {
> +		if (idle_cpu(this) || sched_idle_cpu(this)) {
> +			new = sd_may_idle;
> +			icpu = this;
> +		}
> +	}
> +
>   	/*
>   	 * There is at least one unoccupied cpu available, so
>   	 * propagate it to the filter to avoid false negative
> @@ -9338,6 +9360,12 @@ static void sd_update_state(struct lb_env *env, struct sd_lb_stats *sds)
>   	 * idle cpus thus throughupt downgraded.
>   	 */
>   	if (new != sd_is_busy) {
> +		/*
> +		 * The sd_may_idle state is taken into
> +		 * consideration as well because from
> +		 * here we couldn't actually know task
> +		 * migrations would happen or not.
> +		 */
>   		if (!test_idle_cpus(this))
>   			set_idle_cpus(this, true);
>   	} else {
> @@ -9347,9 +9375,26 @@ static void sd_update_state(struct lb_env *env, struct sd_lb_stats *sds)
>   		 */
>   		if (sd_smt_shared->state == sd_is_busy)
>   			goto out;
> +
> +		/*
> +		 * Allow false positive to exist for some time
> +		 * to make a tradeoff of accuracy of the filter
> +		 * for relieving cache traffic.
> +		 */
> +		if (sd_smt_shared->state == sd_has_icpus) {
> +			new = sd_may_idle;
> +			goto update;
> +		}
> +
> +		/*
> +		 * If the false positive issue has already been
> +		 * there for a while, a correction of the filter
> +		 * is needed.
> +		 */
>   	}
>   
>   	sd_update_icpus(this, sds->idle_cpu);

The @icpu should be used here rather than 'sds->idle_cpu'..
Will be fixed in next version.

> +update:
>   	sd_smt_shared->state = new;
>   out:
>   	xchg(&sd_smt_shared->updating, 0);