linux-kernel - Re: [PATCH v5 2/2] sched/fair: Scan cluster before scanning LLC in wake-up path

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <3d612766-592f-760f-ca43-4ea2257a8bb2@huawei.com>
Date:   Thu, 14 Jul 2022 17:41:04 +0800
From:   Yicong Yang <yangyicong@...wei.com>
To:     <peterz@...radead.org>, <mingo@...hat.com>,
        <juri.lelli@...hat.com>, <vincent.guittot@...aro.org>,
        <tim.c.chen@...ux.intel.com>, <gautham.shenoy@....com>,
        <linux-kernel@...r.kernel.org>,
        <linux-arm-kernel@...ts.infradead.org>
CC:     <dietmar.eggemann@....com>, <rostedt@...dmis.org>,
        <bsegall@...gle.com>, <bristot@...hat.com>,
        <prime.zeng@...wei.com>, <jonathan.cameron@...wei.com>,
        <ego@...ux.vnet.ibm.com>, <srikar@...ux.vnet.ibm.com>,
        <linuxarm@...wei.com>, <21cnbao@...il.com>,
        <guodong.xu@...aro.org>, <hesham.almatary@...wei.com>,
        <john.garry@...wei.com>, <shenyang39@...wei.com>,
        <kprateek.nayak@....com>, <yu.c.chen@...el.com>,
        <wuyun.abel@...edance.com>, Yicong Yang <yangyicong@...ilicon.com>
Subject: Re: [PATCH v5 2/2] sched/fair: Scan cluster before scanning LLC in
 wake-up path

Hi,

a friendly ping..

On 2022/6/30 14:55, Yicong Yang wrote:
> From: Barry Song <song.bao.hua@...ilicon.com>
> 
> For platforms having clusters like Kunpeng920, CPUs within the same cluster
> have lower latency when synchronizing and accessing shared resources like
> cache. Thus, this patch tries to find an idle cpu within the cluster of the
> target CPU before scanning the whole LLC to gain lower latency.
> 
> Note neither Kunpeng920 nor x86 Jacobsville supports SMT, so this patch
> doesn't consider SMT for this moment.
> 
> Testing has been done on Kunpeng920 by pinning tasks to one numa and two
> numa. On Kunpeng920, Each numa has 8 clusters and each cluster has 4 CPUs.
> 
> With this patch, We noticed enhancement on tbench within one numa or cross
> two numa.
> 
> On numa 0:
>                            tip/core                 patched
> Hmean     1        345.89 (   0.00%)      393.96 *  13.90%*
> Hmean     2        697.77 (   0.00%)      786.04 *  12.65%*
> Hmean     4       1392.51 (   0.00%)     1570.26 *  12.76%*
> Hmean     8       2800.61 (   0.00%)     3083.98 *  10.12%*
> Hmean     16      5514.27 (   0.00%)     6116.00 *  10.91%*
> Hmean     32     10869.81 (   0.00%)    10782.98 *  -0.80%*
> Hmean     64      8315.22 (   0.00%)     8519.84 *   2.46%*
> Hmean     128     6324.47 (   0.00%)     7159.35 *  13.20%*
> 
> On numa 0-1:
>                            tip/core                 patched
> Hmean     1        348.68 (   0.00%)      387.91 *  11.25%*
> Hmean     2        693.57 (   0.00%)      774.91 *  11.73%*
> Hmean     4       1369.26 (   0.00%)     1475.48 *   7.76%*
> Hmean     8       2772.99 (   0.00%)     2984.61 *   7.63%*
> Hmean     16      4825.83 (   0.00%)     5873.13 *  21.70%*
> Hmean     32     10250.32 (   0.00%)    11688.06 *  14.03%*
> Hmean     64     16309.51 (   0.00%)    19889.48 *  21.95%*
> Hmean     128    13022.32 (   0.00%)    16005.64 *  22.91%*
> Hmean     256    11335.79 (   0.00%)    13821.74 *  21.93%*
> 
> Tested-by: Yicong Yang <yangyicong@...ilicon.com>
> Signed-off-by: Barry Song <song.bao.hua@...ilicon.com>
> Signed-off-by: Yicong Yang <yangyicong@...ilicon.com>
> Reviewed-by: Tim Chen <tim.c.chen@...ux.intel.com>
> ---
>  kernel/sched/fair.c | 44 +++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 41 insertions(+), 3 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index f80ae86bb404..dff5dec0d792 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6323,6 +6323,40 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
>  
>  #endif /* CONFIG_SCHED_SMT */
>  
> +#ifdef CONFIG_SCHED_CLUSTER
> +/*
> + * Scan the cluster domain for idle CPUs and clear cluster cpumask after scanning
> + */
> +static inline int scan_cluster(struct task_struct *p, struct cpumask *cpus,
> +			       int target, int *nr)
> +{
> +	struct sched_domain *sd = rcu_dereference(per_cpu(sd_cluster, target));
> +	int cpu, idle_cpu;
> +
> +	/* TODO: Support SMT system with cluster topology */
> +	if (!sched_smt_active() && sd) {
> +		for_each_cpu_and(cpu, cpus, sched_domain_span(sd)) {
> +			if (!--*nr)
> +				return -1;
> +
> +			idle_cpu = __select_idle_cpu(cpu, p);
> +			if ((unsigned int)idle_cpu < nr_cpumask_bits)
> +				return idle_cpu;
> +		}
> +
> +		cpumask_andnot(cpus, cpus, sched_domain_span(sd));
> +	}
> +
> +	return -1;
> +}
> +#else
> +static inline int scan_cluster(struct task_struct *p, struct cpumask *cpus,
> +			       int target, int *nr)
> +{
> +	return -1;
> +}
> +#endif
> +
>  /*
>   * Scan the LLC domain for idle CPUs; this is dynamically regulated by
>   * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
> @@ -6383,6 +6417,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>  		}
>  	}
>  
> +	idle_cpu = scan_cluster(p, cpus, target, &nr);
> +	if ((unsigned int)idle_cpu < nr_cpumask_bits)
> +		return idle_cpu;
> +
>  	for_each_cpu_wrap(cpu, cpus, target + 1) {
>  		if (has_idle_core) {
>  			i = select_idle_core(p, cpu, cpus, &idle_cpu);
> @@ -6390,7 +6428,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>  				return i;
>  
>  		} else {
> -			if (!--nr)
> +			if (--nr <= 0)
>  				return -1;
>  			idle_cpu = __select_idle_cpu(cpu, p);
>  			if ((unsigned int)idle_cpu < nr_cpumask_bits)
> @@ -6489,7 +6527,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>  	/*
>  	 * If the previous CPU is cache affine and idle, don't be stupid:
>  	 */
> -	if (prev != target && cpus_share_cache(prev, target) &&
> +	if (prev != target && cpus_share_lowest_cache(prev, target) &&
>  	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
>  	    asym_fits_capacity(task_util, prev))
>  		return prev;
> @@ -6515,7 +6553,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>  	p->recent_used_cpu = prev;
>  	if (recent_used_cpu != prev &&
>  	    recent_used_cpu != target &&
> -	    cpus_share_cache(recent_used_cpu, target) &&
> +	    cpus_share_lowest_cache(recent_used_cpu, target) &&
>  	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
>  	    cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
>  	    asym_fits_capacity(task_util, recent_used_cpu)) {
>