Message-Id: <1262853903.18931.17.camel@minggr.sh.intel.com>
Date: Thu, 07 Jan 2010 16:45:03 +0800
From: Lin Ming <ming.m.lin@...el.com>
To: Mike Galbraith <efault@....de>
Cc: Peter Zijlstra <peterz@...radead.org>,
lkml <linux-kernel@...r.kernel.org>,
"Zhang, Yanmin" <yanmin_zhang@...ux.intel.com>
Subject: Re: [RFC PATCH] sched: Pass affine target cpu into wake_affine
On Tue, 2010-01-05 at 14:43 +0800, Mike Galbraith wrote:
> On Tue, 2010-01-05 at 04:44 +0100, Mike Galbraith wrote:
> > On Tue, 2010-01-05 at 10:48 +0800, Lin Ming wrote:
> > > On Mon, 2010-01-04 at 17:03 +0800, Lin Ming wrote:
> > > > commit a03ecf08d7bbdd979d81163ea13d194fe21ad339
> > > > Author: Lin Ming <ming.m.lin@...el.com>
> > > > Date: Mon Jan 4 14:14:50 2010 +0800
> > > >
> > > > sched: Pass affine target cpu into wake_affine
> > > >
> > > > Since commit a1f84a3 (sched: Check for an idle shared cache in select_task_rq_fair()),
> > > > the affine target may be adjusted to any idle cpu in the cache-sharing domains
> > > > instead of the current cpu.
> > > > But wake_affine still uses the current cpu to calculate the load, which is wrong.
> > > >
> > > > This patch passes the affine cpu into wake_affine.
> > > >
> > > > Signed-off-by: Lin Ming <ming.m.lin@...el.com>
> > >
> > > Mike,
> > >
> > > Any comment of this patch?
> >
> > The patch definitely looks like the right thing to do, but when I tried
> > this, it didn't work out well. Since I can't seem to recall precise
> > details, I'll let my box either remind me or give its ack.
>
> Unfortunately, box reminded me. mysql+oltp peak throughput with
> nr_clients == nr_cpus
Did you test with your vmark regression fix patch also applied?
I tested on the 2 machines below with both patches applied, and the
oltp (sysbench+mysql) data looks good.
Tigerton x86_64 machine: 16cpus(4P/4Cores), 40G mem
IA64 machine: 32cpus(4P/4Cores/HT), 16G mem
Compared with upstream 2.6.33-rc2, the IA64 machine improves by ~15% and the
Tigerton by ~3%.
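To make the intent of the wake_affine change concrete, here is a minimal
userspace sketch (illustrative only, not kernel code; load_of() and the sample
numbers are made up) of the balance check that, with the patch, is taken
against the proposed affine cpu rather than against smp_processor_id():

#include <stdio.h>

/* stand-in for target_load()/source_load(); returns a made-up load figure */
static unsigned long load_of(int cpu)
{
	static const unsigned long loads[] = { 0, 1024, 2048, 512 };
	return loads[cpu & 3];
}

/*
 * Simplified model of the check the patch changes: compare the load on the
 * proposed affine cpu (which may be an idle sibling, not the waking cpu)
 * against the load on the task's previous cpu.
 */
static int wake_affine_sketch(int affine_cpu, int prev_cpu, unsigned int imbalance_pct)
{
	unsigned long affine_load = load_of(affine_cpu);
	unsigned long load = load_of(prev_cpu);
	/* wake_affine() halves the domain's imbalance percentage */
	unsigned int imbalance = 100 + (imbalance_pct - 100) / 2;

	/* an idle affine cpu is always an acceptable wakeup target */
	if (!affine_load)
		return 1;

	/* otherwise the affine cpu's load must be within the imbalance margin */
	return 100 * affine_load <= imbalance * load;
}

int main(void)
{
	/* e.g. prev_cpu = 1 is loaded, the proposed affine cpu = 3 is lightly loaded */
	printf("wake affine to cpu 3? %d\n", wake_affine_sketch(3, 1, 125));
	return 0;
}

The real wake_affine() also folds in effective_load() and the sync heuristics;
the sketch only shows why sampling load on the waking cpu can approve a wakeup
onto a much busier affine target.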
The 2 patches are merged as below,
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 57e6357..5b81156 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -99,7 +99,7 @@ int arch_update_cpu_topology(void);
| 1*SD_WAKE_AFFINE \
| 1*SD_SHARE_CPUPOWER \
| 0*SD_POWERSAVINGS_BALANCE \
- | 0*SD_SHARE_PKG_RESOURCES \
+ | 1*SD_SHARE_PKG_RESOURCES \
| 0*SD_SERIALIZE \
| 0*SD_PREFER_SIBLING \
, \
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 42ac3c9..cbf4bd2 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1237,11 +1237,11 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
#endif
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p, int affine_cpu, int sync)
{
struct task_struct *curr = current;
- unsigned long this_load, load;
- int idx, this_cpu, prev_cpu;
+ unsigned long affine_load, load;
+ int idx, prev_cpu;
unsigned long tl_per_task;
unsigned int imbalance;
struct task_group *tg;
@@ -1249,10 +1249,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
int balanced;
idx = sd->wake_idx;
- this_cpu = smp_processor_id();
prev_cpu = task_cpu(p);
load = source_load(prev_cpu, idx);
- this_load = target_load(this_cpu, idx);
+ affine_load = target_load(affine_cpu, idx);
if (sync) {
if (sched_feat(SYNC_LESS) &&
@@ -1275,7 +1274,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
tg = task_group(current);
weight = current->se.load.weight;
- this_load += effective_load(tg, this_cpu, -weight, -weight);
+ affine_load += effective_load(tg, affine_cpu, -weight, -weight);
load += effective_load(tg, prev_cpu, 0, -weight);
}
@@ -1285,16 +1284,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
imbalance = 100 + (sd->imbalance_pct - 100) / 2;
/*
- * In low-load situations, where prev_cpu is idle and this_cpu is idle
- * due to the sync cause above having dropped this_load to 0, we'll
+ * In low-load situations, where prev_cpu is idle and affine_cpu is idle
+ * due to the sync cause above having dropped affine_load to 0, we'll
* always have an imbalance, but there's really nothing you can do
* about that, so that's good too.
*
* Otherwise check if either cpus are near enough in load to allow this
- * task to be woken on this_cpu.
+ * task to be woken on affine_cpu.
*/
- balanced = !this_load ||
- 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
+ balanced = !affine_load ||
+ 100*(affine_load + effective_load(tg, affine_cpu, weight, weight)) <=
imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
/*
@@ -1306,11 +1305,11 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
return 1;
schedstat_inc(p, se.nr_wakeups_affine_attempts);
- tl_per_task = cpu_avg_load_per_task(this_cpu);
+ tl_per_task = cpu_avg_load_per_task(affine_cpu);
if (balanced ||
- (this_load <= load &&
- this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
+ (affine_load <= load &&
+ affine_load + target_load(prev_cpu, idx) <= tl_per_task)) {
/*
* This domain has SD_WAKE_AFFINE and
* p is cache cold in this domain, and
@@ -1508,7 +1507,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
* If there's an idle sibling in this domain, make that
* the wake_affine target instead of the current cpu.
*/
- if (tmp->flags & SD_PREFER_SIBLING)
+ if (tmp->flags & SD_SHARE_PKG_RESOURCES)
target = select_idle_sibling(p, tmp, target);
if (target >= 0) {
@@ -1544,7 +1543,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
update_shares(tmp);
}
- if (affine_sd && wake_affine(affine_sd, p, sync))
+ if (affine_sd && wake_affine(affine_sd, p, cpu, sync))
return cpu;
while (sd) {
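As an aside on the first select_task_rq_fair() hunk above: gating on
SD_SHARE_PKG_RESOURCES only controls when select_idle_sibling() is consulted.
A rough standalone model of that step (illustrative only; cpu_is_idle() and
the cpu set are made up, not the kernel implementation) looks like:

#include <stdio.h>
#include <stdbool.h>

#define NR_CPUS 4

/* stand-in for idle_cpu(); which cpus are idle is made up for the example */
static bool cpu_is_idle(int cpu)
{
	static const bool idle[NR_CPUS] = { false, false, true, true };
	return idle[cpu];
}

/*
 * Rough model: scan the cpus that share package resources (here simply all
 * NR_CPUS) and, if one is idle, return it as the new wake_affine target;
 * otherwise keep the original target.
 */
static int select_idle_sibling_sketch(int target)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (cpu_is_idle(cpu))
			return cpu;
	}
	return target;
}

int main(void)
{
	printf("affine target: cpu %d\n", select_idle_sibling_sketch(0));
	return 0;
}

Whichever cpu comes out of that step is the one wake_affine() should be
weighing, which is what the affine_cpu argument above provides.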
--