lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <b6dafa6d-6d9a-3499-85b8-52c5163cd38d@intel.com>
Date:   Wed, 23 Oct 2019 15:50:28 +0800
From:   "Chen, Rong A" <rong.a.chen@...el.com>
To:     Vincent Guittot <vincent.guittot@...aro.org>,
        linux-kernel@...r.kernel.org, mingo@...hat.com,
        peterz@...radead.org
Cc:     pauld@...hat.com, valentin.schneider@....com,
        srikar@...ux.vnet.ibm.com, quentin.perret@....com,
        dietmar.eggemann@....com, Morten.Rasmussen@....com,
        hdanton@...a.com, parth@...ux.ibm.com, riel@...riel.com
Subject: Re: [PATCH] sched/fair: fix rework of find_idlest_group()

Tested-by: kernel test robot <rong.a.chen@...el.com>

On 10/23/2019 12:46 AM, Vincent Guittot wrote:
> The task, for which the scheduler looks for the idlest group of CPUs, must
> be discounted from all statistics in order to get a fair comparison
> between groups. This includes utilization, load, nr_running and idle_cpus.
>
> Such unfairness can be easily highlighted with the unixbench execl 1 task.
> This test continuously call execve() and the scheduler looks for the idlest
> group/CPU on which it should place the task. Because the task runs on the
> local group/CPU, the latter seems already busy even if there is nothing
> else running on it. As a result, the scheduler will always select another
> group/CPU than the local one.
>
> Fixes: 57abff067a08 ("sched/fair: Rework find_idlest_group()")
> Reported-by: kernel test robot <rong.a.chen@...el.com>
> Signed-off-by: Vincent Guittot <vincent.guittot@...aro.org>
> ---
>
> This recover most of the perf regression on my system and I have asked
> Rong if he can rerun the test with the patch to check that it fixes his
> system as well.
>
>   kernel/sched/fair.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++-----
>   1 file changed, 83 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index a81c364..0ad4b21 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5379,6 +5379,36 @@ static unsigned long cpu_load(struct rq *rq)
>   {
>   	return cfs_rq_load_avg(&rq->cfs);
>   }
> +/*
> + * cpu_load_without - compute cpu load without any contributions from *p
> + * @cpu: the CPU which load is requested
> + * @p: the task which load should be discounted
> + *
> + * The load of a CPU is defined by the load of tasks currently enqueued on that
> + * CPU as well as tasks which are currently sleeping after an execution on that
> + * CPU.
> + *
> + * This method returns the load of the specified CPU by discounting the load of
> + * the specified task, whenever the task is currently contributing to the CPU
> + * load.
> + */
> +static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
> +{
> +	struct cfs_rq *cfs_rq;
> +	unsigned int load;
> +
> +	/* Task has no contribution or is new */
> +	if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
> +		return cpu_load(rq);
> +
> +	cfs_rq = &rq->cfs;
> +	load = READ_ONCE(cfs_rq->avg.load_avg);
> +
> +	/* Discount task's util from CPU's util */
> +	lsub_positive(&load, task_h_load(p));
> +
> +	return load;
> +}
>   
>   static unsigned long capacity_of(int cpu)
>   {
> @@ -8117,10 +8147,55 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
>   struct sg_lb_stats;
>   
>   /*
> + * task_running_on_cpu - return 1 if @p is running on @cpu.
> + */
> +
> +static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
> +{
> +	/* Task has no contribution or is new */
> +	if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
> +		return 0;
> +
> +	if (task_on_rq_queued(p))
> +		return 1;
> +
> +	return 0;
> +}
> +
> +/**
> + * idle_cpu_without - would a given CPU be idle without p ?
> + * @cpu: the processor on which idleness is tested.
> + * @p: task which should be ignored.
> + *
> + * Return: 1 if the CPU would be idle. 0 otherwise.
> + */
> +static int idle_cpu_without(int cpu, struct task_struct *p)
> +{
> +	struct rq *rq = cpu_rq(cpu);
> +
> +	if ((rq->curr != rq->idle) && (rq->curr != p))
> +		return 0;
> +
> +	/*
> +	 * rq->nr_running can't be used but an updated version without the
> +	 * impact of p on cpu must be used instead. The updated nr_running
> +	 * be computed and tested before calling idle_cpu_without().
> +	 */
> +
> +#ifdef CONFIG_SMP
> +	if (!llist_empty(&rq->wake_list))
> +		return 0;
> +#endif
> +
> +	return 1;
> +}
> +
> +/*
>    * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
> - * @denv: The ched_domain level to look for idlest group.
> + * @sd: The sched_domain level to look for idlest group.
>    * @group: sched_group whose statistics are to be updated.
>    * @sgs: variable to hold the statistics for this group.
> + * @p: The task for which we look for the idlest group/CPU.
>    */
>   static inline void update_sg_wakeup_stats(struct sched_domain *sd,
>   					  struct sched_group *group,
> @@ -8133,21 +8208,22 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
>   
>   	for_each_cpu(i, sched_group_span(group)) {
>   		struct rq *rq = cpu_rq(i);
> +		unsigned int local;
>   
> -		sgs->group_load += cpu_load(rq);
> +		sgs->group_load += cpu_load_without(rq, p);
>   		sgs->group_util += cpu_util_without(i, p);
> -		sgs->sum_h_nr_running += rq->cfs.h_nr_running;
> +		local = task_running_on_cpu(i, p);
> +		sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
>   
> -		nr_running = rq->nr_running;
> +		nr_running = rq->nr_running - local;
>   		sgs->sum_nr_running += nr_running;
>   
>   		/*
> -		 * No need to call idle_cpu() if nr_running is not 0
> +		 * No need to call idle_cpu_without() if nr_running is not 0
>   		 */
> -		if (!nr_running && idle_cpu(i))
> +		if (!nr_running && idle_cpu_without(i, p))
>   			sgs->idle_cpus++;
>   
> -
>   	}
>   
>   	/* Check if task fits in the group */

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ