lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAKfTPtCnN95A8kQ-uBA7ykTMAzQVRzwER-XNJt4YyQXdxhDCTQ@mail.gmail.com>
Date: Wed, 6 Mar 2024 18:40:08 +0100
From: Vincent Guittot <vincent.guittot@...aro.org>
To: Shrikanth Hegde <sshegde@...ux.ibm.com>
Cc: mingo@...nel.org, peterz@...radead.org, yu.c.chen@...el.com, 
	dietmar.eggemann@....com, linux-kernel@...r.kernel.org, nysal@...ux.ibm.com, 
	aboorvad@...ux.ibm.com, srikar@...ux.ibm.com, vschneid@...hat.com, 
	pierre.gondois@....com, qyousef@...alina.io
Subject: Re: [PATCH v5 1/3] sched/fair: Add EAS checks before updating overutilized

On Wed, 6 Mar 2024 at 11:25, Shrikanth Hegde <sshegde@...ux.ibm.com> wrote:
>
> The overutilized field of the root domain is only used by EAS (energy aware
> scheduler) to decide whether to do load balancing or not. It is not used if
> EAS is not possible.
>
> Currently enqueue_task_fair and task_tick_fair access, and sometimes update,
> this field. In update_sd_lb_stats it is updated often. This causes cache
> contention due to true sharing and burns a lot of cycles. overload and
> overutilized are part of the same cacheline. Updating overutilized often
> invalidates the cacheline. That causes accesses to overload to slow down due
> to false sharing. Hence add an EAS check before accessing/updating this field.
> The EAS check is either optimized at compile time or is a static branch.
> Hence it shouldn't cost much.
>
> With the patch, both enqueue_task_fair and newidle_balance don't show
> up as hot routines in perf profile.
>
> 6.8-rc4:
> 7.18%  swapper          [kernel.vmlinux]              [k] enqueue_task_fair
> 6.78%  s                [kernel.vmlinux]              [k] newidle_balance
> +patch:
> 0.14%  swapper          [kernel.vmlinux]              [k] enqueue_task_fair
> 0.00%  swapper          [kernel.vmlinux]              [k] newidle_balance
>
> Minor change: trace_sched_overutilized_tp expects its second argument to
> be a bool. So do an int-to-bool conversion for that.
>
> Fixes: 2802bf3cd936 ("sched/fair: Add over-utilization/tipping point indicator")
> Reviewed-by: Qais Yousef <qyousef@...alina.io>
> Reviewed-by: Srikar Dronamraju <srikar@...ux.ibm.com>
> Signed-off-by: Shrikanth Hegde <sshegde@...ux.ibm.com>
> ---
>  kernel/sched/fair.c | 62 +++++++++++++++++++++++++++++++--------------
>  1 file changed, 43 insertions(+), 19 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6a16129f9a5c..997e570d9423 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6663,22 +6663,51 @@ static inline void hrtick_update(struct rq *rq)
>  #ifdef CONFIG_SMP
>  static inline bool cpu_overutilized(int cpu)
>  {
> -       unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
> -       unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
> +       unsigned long  rq_util_min, rq_util_max;
> +
> +       if (!sched_energy_enabled())
> +               return false;
> +
> +       rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
> +       rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
>
>         /* Return true only if the utilization doesn't fit CPU's capacity */
>         return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
>  }
>
> -static inline void update_overutilized_status(struct rq *rq)
> +static inline void set_rd_overutilized_status(struct root_domain *rd,
> +                                             unsigned int status)
>  {
> -       if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
> -               WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
> -               trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
> -       }
> +       if (!sched_energy_enabled())
> +               return;
> +
> +       WRITE_ONCE(rd->overutilized, status);
> +       trace_sched_overutilized_tp(rd, !!status);
> +}
> +
> +static inline void check_update_overutilized_status(struct rq *rq)
> +{
> +       /*
> +        * overutilized field is used for load balancing decisions only
> +        * if energy aware scheduler is being used
> +        */
> +       if (!sched_energy_enabled())
> +               return;
> +
> +       if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu))
> +               set_rd_overutilized_status(rq->rd, SG_OVERUTILIZED);
>  }
>  #else
> -static inline void update_overutilized_status(struct rq *rq) { }
> +static inline void check_update_overutilized_status(struct rq *rq)
> +{
> +       return 0;
> +}

static inline void check_update_overutilized_status(struct rq *rq) { }

> +
> +static inline void set_rd_overutilized_status(struct root_domain *rd,
> +                                             unsigned int status)
> +{
> +       return 0;
> +}

static inline void set_rd_overutilized_status(struct root_domain *rd, unsigned int status) { }

My comment on v4 about { return 0; } applies only to
static inline int is_rd_overutilized(struct root_domain *rd).

Also, I don't think that set_rd_overutilized_status() is called
outside #ifdef CONFIG_SMP so you can remove it.




>  #endif
>
>  /* Runqueue only has SCHED_IDLE tasks enqueued */
> @@ -6779,7 +6808,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>          * and the following generally works well enough in practice.
>          */
>         if (!task_new)
> -               update_overutilized_status(rq);
> +               check_update_overutilized_status(rq);
>
>  enqueue_throttle:
>         assert_list_leaf_cfs_rq(rq);
> @@ -10596,19 +10625,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>                 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
>
>         if (!env->sd->parent) {
> -               struct root_domain *rd = env->dst_rq->rd;
> -
>                 /* update overload indicator if we are at root domain */
> -               WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
> +               WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD);
>
>                 /* Update over-utilization (tipping point, U >= 0) indicator */
> -               WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
> -               trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
> +               set_rd_overutilized_status(env->dst_rq->rd,
> +                                          sg_status & SG_OVERUTILIZED);
>         } else if (sg_status & SG_OVERUTILIZED) {
> -               struct root_domain *rd = env->dst_rq->rd;
> -
> -               WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
> -               trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
> +               set_rd_overutilized_status(env->dst_rq->rd, SG_OVERUTILIZED);
>         }
>
>         update_idle_cpu_scan(env, sum_util);
> @@ -12609,7 +12633,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
>                 task_tick_numa(rq, curr);
>
>         update_misfit_status(curr, rq);
> -       update_overutilized_status(task_rq(curr));
> +       check_update_overutilized_status(task_rq(curr));
>
>         task_tick_core(rq, curr);
>  }
> --
> 2.39.3
>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ