lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20170428203845.GA22354@htj.duckdns.org>
Date:   Fri, 28 Apr 2017 16:38:45 -0400
From:   Tejun Heo <tj@...nel.org>
To:     Vincent Guittot <vincent.guittot@...aro.org>
Cc:     Ingo Molnar <mingo@...hat.com>,
        Peter Zijlstra <peterz@...radead.org>,
        linux-kernel <linux-kernel@...r.kernel.org>,
        Linus Torvalds <torvalds@...ux-foundation.org>,
        Mike Galbraith <efault@....de>, Paul Turner <pjt@...gle.com>,
        Chris Mason <clm@...com>, kernel-team@...com
Subject: Re: [PATCH 2/2] sched/fair: Always propagate runnable_load_avg

Here's the debug patch.

The debug condition triggers when the load balancer picks a group that
has no CPU running more than one schbench thread over a group that does.

 /sys/module/fair/parameters/dbg_odd_cnt: resettable counter
 /sys/module/fair/parameters/dbg_odd_nth: dump group states on Nth
					  occurrence via trace_printk()

The load / weights are printed out so that NICE_0_LOAD is 1.000.

Thanks.
---
 kernel/sched/fair.c |  160 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 159 insertions(+), 1 deletion(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -32,11 +32,18 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate.h>
 #include <linux/task_work.h>
+#include <linux/moduleparam.h>
 
 #include <trace/events/sched.h>
 
 #include "sched.h"
 
+/*
+ * Debug knobs, exposed as /sys/module/fair/parameters/:
+ *  dbg_odd_nth: dump group states on every Nth odd occurrence; when zero,
+ *               every occurrence is dumped (see dbg_odd())
+ *  dbg_odd_cnt: resettable counter of odd occurrences observed
+ */
+static unsigned long dbg_odd_nth;
+static unsigned long dbg_odd_cnt;
+
+module_param(dbg_odd_nth, ulong, 0644);
+module_param(dbg_odd_cnt, ulong, 0644);
+
 /*
  * Targeted preemption latency for CPU-bound tasks:
  *
@@ -7413,6 +7420,149 @@ static inline void update_sg_lb_stats(st
 	sgs->group_type = group_classify(group, sgs);
 }
 
+/*
+ * Count tasks whose comm starts with "schbench" on @rq's cfs_tasks list.
+ * Takes rq->lock so the list is stable while being walked.
+ */
+static int count_schb(struct rq *rq)
+{
+	unsigned long flags;
+	struct task_struct *p;
+	int cnt = 0;
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+
+	/* prefix match over the 8 characters of "schbench" */
+	list_for_each_entry(p, &rq->cfs_tasks, se.group_node)
+		if (!strncmp(p->comm, "schbench", 8))
+			cnt++;
+
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	return cnt;
+}
+
+/* true iff some CPU in @sg has two or more schbench tasks queued */
+static bool sg_has_two_schb(struct sched_group *sg)
+{
+	int cpu;
+
+	for_each_cpu(cpu, sched_group_cpus(sg))
+		if (count_schb(cpu_rq(cpu)) >= 2)
+			return true;
+	return false;
+}
+
+/* per-CPU scratch page used to format one queue's entity list */
+static DEFINE_PER_CPU(char [PAGE_SIZE], odd_buf);
+
+/*
+ * Expand a load value into a pair of ints, "whole" and "milli", so that
+ * NICE_0_LOAD prints as 1.000.  lbw() takes an already-scaled weight;
+ * lba() applies scale_load() first (used for the *_avg fields).
+ */
+#define lbw(x)	(int)((x) / NICE_0_LOAD), (int)(((x) % NICE_0_LOAD) * 1000 / NICE_0_LOAD)
+#define lba(x)	(int)((scale_load(x)) / NICE_0_LOAD), (int)(((scale_load(x)) % NICE_0_LOAD) * 1000 / NICE_0_LOAD)
+
+/*
+ * Append one sched_entity's description to @buf at offset @cnt, bounded by
+ * @size: comm/pid for a task entity, cgroup name for a group entity, then
+ * weight/load_avg/util_avg via lbw()/lba().  @postfix tags the name (the
+ * caller passes "C" for the currently running entity).  Returns the new
+ * offset into @buf.
+ */
+static int odd_append_se(struct sched_entity *se, const char *postfix,
+			 int cnt, char *buf, size_t size)
+{
+/* bounded append: scnprintf() never overruns and cnt is clamped to size */
+#define odd_append(fmt, args...)	do {				\
+	cnt += scnprintf(buf + cnt, size - cnt, fmt, ##args);		\
+	cnt = min_t(int, cnt, size);					\
+} while (0)
+
+	if (entity_is_task(se)) {
+		struct task_struct *task = task_of(se);
+		odd_append(" %s(%d%s)", task->comm, task->pid, postfix);
+	} else {
+		char nbuf[64];
+		cgroup_name(se->my_q->tg->css.cgroup, nbuf, sizeof(nbuf));
+		odd_append(" %s(%s)", nbuf, postfix);
+	}
+	odd_append(":w=%d.%03d,l=%d.%03d,u=%d.%03d",
+		   lbw(se->load.weight),
+		   lba(se->avg.load_avg),
+		   lba(se->avg.util_avg));
+
+	return cnt;
+}
+
+/*
+ * Dump one sched_group's state for an odd-pick event via trace_printk().
+ * @pref tags every emitted line (callers pass "A: " / "B: ") so the two
+ * groups being compared can be told apart in the trace.
+ *
+ * Emits the group-wide sg_lb_stats, then, for each CPU in the group, the
+ * rq's counts plus every non-empty cfs_rq with its queued entities.
+ */
+static void dbg_odd_dump(const char *pref,
+			 struct sched_group *sg, struct sg_lb_stats *sgs)
+{
+	int cpu;
+
+	/* group-level load-balance statistics; see lbw()/lba() for units */
+	trace_printk("%sgrp=%*pbl w=%u avg=%d.%03d grp=%d.%03d sum=%d.%03d pertask=%d.%03d\n", pref,
+		     cpumask_pr_args(sched_group_cpus(sg)), sg->group_weight,
+		     lba(sgs->avg_load), lba(sgs->group_load),
+		     lba(sgs->sum_weighted_load), lba(sgs->load_per_task));
+	trace_printk("%sgcap=%d.%03d gutil=%d.%03d run=%u idle=%u gwt=%u type=%d nocap=%d\n",
+		     pref,
+		     lba(sgs->group_capacity), lba(sgs->group_util),
+		     sgs->sum_nr_running, sgs->idle_cpus, sgs->group_weight,
+		     sgs->group_type, sgs->group_no_capacity);
+
+	for_each_cpu(cpu, sched_group_cpus(sg)) {
+		struct task_group *tg;
+		unsigned long flags;
+
+		trace_printk("%sCPU%03d: run=%u schb=%d\n", pref, cpu,
+			     cpu_rq(cpu)->nr_running, count_schb(cpu_rq(cpu)));
+
+		/* hold rq->lock while walking this CPU's cfs_rqs and entities */
+		raw_spin_lock_irqsave(&cpu_rq(cpu)->lock, flags);
+
+		list_for_each_entry_rcu(tg, &task_groups, list) {
+			struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+			char qname[32] = "root";
+			int depth = 0;
+			long tg_weight = 0, tg_shares = 0;
+			struct sched_entity *se;
+			char *buf = per_cpu_ptr(odd_buf, cpu);
+			int cnt;
+
+			/* skip queues with nothing queued */
+			if (!cfs_rq->nr_running)
+				continue;
+
+			if (cfs_rq->tg) {
+				cgroup_name(cfs_rq->tg->css.cgroup, qname, sizeof(qname));
+				if (cfs_rq->tg->se[cpu])
+					depth = cfs_rq->tg->se[cpu]->depth;
+				tg_weight = atomic_long_read(&cfs_rq->tg->load_avg);
+				tg_shares = cfs_rq->tg->shares;
+			}
+
+			/* per-queue state: weight, load/util/runnable avgs, counts */
+			trace_printk("%sQ%03d-%s@%d: w=%d.%03d,l=%d.%03d,u=%d.%03d,r=%d.%03d run=%u hrun=%u tgs=%d.%03d tgw=%d.%03d\n",
+				     pref, cpu, qname, depth,
+				     lbw(cfs_rq->load.weight),
+				     lba(cfs_rq->avg.load_avg),
+				     lba(cfs_rq->avg.util_avg),
+				     lba(cfs_rq->runnable_load_avg),
+				     cfs_rq->nr_running, cfs_rq->h_nr_running,
+				     lbw(tg_shares),
+				     lba(tg_weight));
+
+			/* list curr (tagged "C") plus all queued entities in @buf */
+			buf[0] = '\0';
+			cnt = 0;
+
+			if (cfs_rq->curr)
+				cnt = odd_append_se(cfs_rq->curr, "C", cnt, buf, PAGE_SIZE);
+
+			for (se = __pick_first_entity(cfs_rq); se;
+			     se = __pick_next_entity(se))
+				cnt = odd_append_se(se, "", cnt, buf, PAGE_SIZE);
+
+			trace_printk("%sQ%03d-%s@%d: %s\n",
+				     pref, cpu, qname, depth, buf);
+		}
+
+		raw_spin_unlock_irqrestore(&cpu_rq(cpu)->lock, flags);
+	}
+}
+
+/*
+ * Report an "odd" busiest-group comparison: group @sga has a CPU running
+ * two or more schbench threads while group @sgb does not, yet @sgb is the
+ * one being preferred (see the update_sd_pick_busiest() call sites).
+ *
+ * Rate limited by dbg_odd_nth: when non-zero, dbg_odd_cnt is advanced and
+ * only every Nth occurrence is dumped; when zero, every occurrence is
+ * dumped and the counter is left untouched.
+ */
+static void dbg_odd(struct lb_env *env,
+		    struct sched_group *sga, struct sg_lb_stats *sgsa,
+		    struct sched_group *sgb, struct sg_lb_stats *sgsb)
+{
+	if (dbg_odd_nth && (dbg_odd_cnt++ % dbg_odd_nth))
+		return;
+
+	/* load-balance environment, then full dumps of both groups */
+	trace_printk("odd: dst=%d idle=%d brk=%u lbtgt=%*pbl type=%d\n",
+		     env->dst_cpu, env->idle, env->loop_break,
+		     cpumask_pr_args(env->cpus), env->fbq_type);
+	dbg_odd_dump("A: ", sga, sgsa);
+	dbg_odd_dump("B: ", sgb, sgsb);
+}
+
 /**
  * update_sd_pick_busiest - return 1 on busiest group
  * @env: The load balancing environment.
@@ -7432,6 +7582,8 @@ static bool update_sd_pick_busiest(struc
 				   struct sg_lb_stats *sgs)
 {
 	struct sg_lb_stats *busiest = &sds->busiest_stat;
+	bool busiest_has_two = sds->busiest && sg_has_two_schb(sds->busiest);
+	bool sg_has_two = sg_has_two_schb(sg);
 
 	if (sgs->group_type > busiest->group_type)
 		return true;
@@ -7439,8 +7591,14 @@ static bool update_sd_pick_busiest(struc
 	if (sgs->group_type < busiest->group_type)
 		return false;
 
-	if (sgs->avg_load <= busiest->avg_load)
+	if (sgs->avg_load <= busiest->avg_load) {
+		if (sg_has_two && !busiest_has_two)
+			dbg_odd(env, sg, sgs, sds->busiest, busiest);
 		return false;
+	}
+
+	if (!sg_has_two && busiest_has_two)
+		dbg_odd(env, sds->busiest, busiest, sg, sgs);
 
 	if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
 		goto asym_packing;

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ