Date:	Thu, 15 May 2008 22:40:42 +0530
From:	Srivatsa Vaddagiri <vatsa@...ux.vnet.ibm.com>
To:	Peter Zijlstra <a.p.zijlstra@...llo.nl>
Cc:	Mike Galbraith <efault@....de>,
	"Zhang, Yanmin" <yanmin_zhang@...ux.intel.com>,
	Dhaval Giani <dhaval@...ux.vnet.ibm.com>,
	LKML <linux-kernel@...r.kernel.org>, Ingo Molnar <mingo@...e.hu>,
	Aneesh Kumar KV <aneesh.kumar@...ux.vnet.ibm.com>
Subject: Re: volanoMark regression with kernel 2.6.26-rc1

On Thu, May 15, 2008 at 10:41:38AM +0200, Peter Zijlstra wrote:
> Yes - and that should not be too big an issue as long as we can deal
> with it.
> 
> Any number we'll put to it will be based on a snapshot of the state so
> we're wrong no matter what we do. The trick is trying to keep sane.

More than staleness, my concern is the interleaving of writes and reads.

Let's say CPU0 is updating tg->cfs_rq[0]->aggregate.{load,shares,rq_weight}
at the CPU domain level (comprising CPUs 0-7). That results in writes in this order:

	->rq_weight = ?
	->task_weight = ?
	->shares = ?
	->load = ?

At the same time, CPU1 could be doing a load_balance_fair() in the SMT
domain, reading those same fields in this order:

	->rq_weight
	->load
	->task_weight
	->shares

What if the writes (on CPU0) and reads (on CPU1) are interleaved? Won't
that give the reader incoherent results?
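
To make the concern concrete, here is a minimal userspace sketch (plain
pthreads, not the kernel code; the struct, the generation values and the
loop counts are made up purely for illustration). A writer updates the four
aggregate fields in the CPU-domain order above while a reader reads them in
the load_balance_fair() order; without a common lock the reader can observe
a mix of old and new values:

/*
 * Userspace sketch of the interleaving concern, NOT kernel code.
 * The fields are volatile only so the compiler does not optimise the
 * racy accesses away; the race itself is the point of the example.
 *
 * Build: gcc -O2 -pthread interleave.c -o interleave
 */
#include <pthread.h>
#include <stdio.h>

struct aggregate {
	volatile unsigned long rq_weight;
	volatile unsigned long task_weight;
	volatile unsigned long shares;
	volatile unsigned long load;
};

static struct aggregate agg;		/* shared, intentionally unlocked */

#define ITERS	1000000UL

static void *writer(void *arg)
{
	unsigned long gen;

	for (gen = 1; gen <= ITERS; gen++) {
		/* write order used by the CPU-domain update */
		agg.rq_weight   = gen;
		agg.task_weight = gen;
		agg.shares      = gen;
		agg.load        = gen;
	}
	return NULL;
}

static void *reader(void *arg)
{
	unsigned long torn = 0, i;

	for (i = 0; i < ITERS; i++) {
		/* read order used by load_balance_fair() */
		unsigned long rq_weight   = agg.rq_weight;
		unsigned long load        = agg.load;
		unsigned long task_weight = agg.task_weight;
		unsigned long shares      = agg.shares;

		/* a coherent snapshot would have all four values equal */
		if (rq_weight != load || load != task_weight ||
		    task_weight != shares)
			torn++;
	}
	printf("incoherent snapshots seen: %lu out of %lu\n", torn, ITERS);
	return NULL;
}

int main(void)
{
	pthread_t w, r;

	pthread_create(&w, NULL, writer, NULL);
	pthread_create(&r, NULL, reader, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	return 0;
}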

> My current stack on top of sched-devel:
> 
> http://programming.kicks-ass.net/kernel-patches/sched-smp-group-fixes/

Doesn't improve things here (8-way Intel Xeon with SCHED_SMT set):

2.6.25 							     : 21762.4
2.6.26-rc1 + sched_devel 				     : 17937.5 (-17.6%)
2.6.26-rc1 + sched_devel + your patches 		     : 17047   (-21.6%)
2.6.26-rc1 + sched_devel + patch_below  		     : 18368.9 (-15.6%)
2.6.26-rc1 + sched_devel + patch_below + ^NORMALIZED_SLEEPER : 19589.6 (-9.9%)
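
(Numbers are VolanoMark scores, higher is better; the percentages are
relative to the 2.6.25 baseline, e.g. (17937.5 - 21762.4) / 21762.4 = -17.6%.)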

I will check tomorrow whether patch_below combined with your patches helps
close the gap. patch_below makes each CPU use its own cfs_rq's aggregate copy
(indexed by this_cpu instead of sd->first_cpu), serialized by that CPU's
aggregate_lock:


---
 kernel/sched.c      |   98 ++++++++++++++++++++++++++--------------------------
 kernel/sched_fair.c |   26 +++++--------
 2 files changed, 61 insertions(+), 63 deletions(-)

Index: current/kernel/sched.c
===================================================================
--- current.orig/kernel/sched.c
+++ current/kernel/sched.c
@@ -1568,12 +1568,12 @@ static int task_hot(struct task_struct *
  */
 
 static inline struct aggregate_struct *
-aggregate(struct task_group *tg, struct sched_domain *sd)
+aggregate(struct task_group *tg, int this_cpu)
 {
-	return &tg->cfs_rq[sd->first_cpu]->aggregate;
+	return &tg->cfs_rq[this_cpu]->aggregate;
 }
 
-typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
+typedef void (*aggregate_func)(struct task_group *, struct sched_domain *, int);
 
 /*
  * Iterate the full tree, calling @down when first entering a node and @up when
@@ -1581,14 +1581,14 @@ typedef void (*aggregate_func)(struct ta
  */
 static
 void aggregate_walk_tree(aggregate_func down, aggregate_func up,
-			 struct sched_domain *sd)
+			 struct sched_domain *sd, int this_cpu)
 {
 	struct task_group *parent, *child;
 
 	rcu_read_lock();
 	parent = &root_task_group;
 down:
-	(*down)(parent, sd);
+	(*down)(parent, sd, this_cpu);
 	list_for_each_entry_rcu(child, &parent->children, siblings) {
 		parent = child;
 		goto down;
@@ -1596,7 +1596,7 @@ down:
 up:
 		continue;
 	}
-	(*up)(parent, sd);
+	(*up)(parent, sd, this_cpu);
 
 	child = parent;
 	parent = parent->parent;
@@ -1609,7 +1609,8 @@ up:
  * Calculate the aggregate runqueue weight.
  */
 static
-void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
+void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd,
+				int this_cpu)
 {
 	unsigned long rq_weight = 0;
 	unsigned long task_weight = 0;
@@ -1620,15 +1621,16 @@ void aggregate_group_weight(struct task_
 		task_weight += tg->cfs_rq[i]->task_weight;
 	}
 
-	aggregate(tg, sd)->rq_weight = rq_weight;
-	aggregate(tg, sd)->task_weight = task_weight;
+	aggregate(tg, this_cpu)->rq_weight = rq_weight;
+	aggregate(tg, this_cpu)->task_weight = task_weight;
 }
 
 /*
  * Compute the weight of this group on the given cpus.
  */
 static
-void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
+void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd,
+				int this_cpu)
 {
 	unsigned long shares = 0;
 	int i;
@@ -1636,10 +1638,11 @@ void aggregate_group_shares(struct task_
 	for_each_cpu_mask(i, sd->span)
 		shares += tg->cfs_rq[i]->shares;
 
-	if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
+	if ((!shares && aggregate(tg, this_cpu)->rq_weight) ||
+						 shares > tg->shares)
 		shares = tg->shares;
 
-	aggregate(tg, sd)->shares = shares;
+	aggregate(tg, this_cpu)->shares = shares;
 }
 
 /*
@@ -1647,7 +1650,8 @@ void aggregate_group_shares(struct task_
  * weight and this group's parent's load, i.e. top-down.
  */
 static
-void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
+void aggregate_group_load(struct task_group *tg, struct sched_domain *sd,
+					int this_cpu)
 {
 	unsigned long load;
 
@@ -1659,17 +1663,17 @@ void aggregate_group_load(struct task_gr
 			load += cpu_rq(i)->load.weight;
 
 	} else {
-		load = aggregate(tg->parent, sd)->load;
+		load = aggregate(tg->parent, this_cpu)->load;
 
 		/*
 		 * shares is our weight in the parent's rq so
 		 * shares/parent->rq_weight gives our fraction of the load
 		 */
-		load *= aggregate(tg, sd)->shares;
-		load /= aggregate(tg->parent, sd)->rq_weight + 1;
+		load *= aggregate(tg, this_cpu)->shares;
+		load /= aggregate(tg->parent, this_cpu)->rq_weight + 1;
 	}
 
-	aggregate(tg, sd)->load = load;
+	aggregate(tg, this_cpu)->load = load;
 }
 
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
@@ -1679,7 +1683,7 @@ static void __set_se_shares(struct sched
  */
 static void
 __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
-			  int tcpu)
+			  int tcpu, int this_cpu)
 {
 	int boost = 0;
 	unsigned long shares;
@@ -1706,8 +1710,8 @@ __update_group_shares_cpu(struct task_gr
 	 *               \Sum rq_weight
 	 *
 	 */
-	shares = aggregate(tg, sd)->shares * rq_weight;
-	shares /= aggregate(tg, sd)->rq_weight + 1;
+	shares = aggregate(tg, this_cpu)->shares * rq_weight;
+	shares /= aggregate(tg, this_cpu)->rq_weight + 1;
 
 	/*
 	 * record the actual number of shares, not the boosted amount.
@@ -1734,8 +1738,8 @@ __move_group_shares(struct task_group *t
 
 	shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
 
-	__update_group_shares_cpu(tg, sd, scpu);
-	__update_group_shares_cpu(tg, sd, dcpu);
+	__update_group_shares_cpu(tg, sd, scpu, dcpu);
+	__update_group_shares_cpu(tg, sd, dcpu, dcpu);
 
 	/*
 	 * ensure we never loose shares due to rounding errors in the
@@ -1761,9 +1765,10 @@ move_group_shares(struct task_group *tg,
 }
 
 static
-void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
+void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd,
+					int this_cpu)
 {
-	unsigned long shares = aggregate(tg, sd)->shares;
+	unsigned long shares = aggregate(tg, this_cpu)->shares;
 	int i;
 
 	for_each_cpu_mask(i, sd->span) {
@@ -1771,20 +1776,20 @@ void aggregate_group_set_shares(struct t
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		__update_group_shares_cpu(tg, sd, i);
+		__update_group_shares_cpu(tg, sd, i, this_cpu);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
 
-	aggregate_group_shares(tg, sd);
+	aggregate_group_shares(tg, sd, this_cpu);
 
 	/*
 	 * ensure we never loose shares due to rounding errors in the
 	 * above redistribution.
 	 */
-	shares -= aggregate(tg, sd)->shares;
+	shares -= aggregate(tg, this_cpu)->shares;
 	if (shares) {
 		tg->cfs_rq[sd->first_cpu]->shares += shares;
-		aggregate(tg, sd)->shares += shares;
+		aggregate(tg, this_cpu)->shares += shares;
 	}
 }
 
@@ -1793,20 +1798,22 @@ void aggregate_group_set_shares(struct t
  * while walking down the tree.
  */
 static
-void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
+void aggregate_get_down(struct task_group *tg, struct sched_domain *sd,
+				 int this_cpu)
 {
-	aggregate_group_weight(tg, sd);
-	aggregate_group_shares(tg, sd);
-	aggregate_group_load(tg, sd);
+	aggregate_group_weight(tg, sd, this_cpu);
+	aggregate_group_shares(tg, sd, this_cpu);
+	aggregate_group_load(tg, sd, this_cpu);
 }
 
 /*
  * Rebalance the cpu shares while walking back up the tree.
  */
 static
-void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
+void aggregate_get_up(struct task_group *tg, struct sched_domain *sd,
+				 int this_cpu)
 {
-	aggregate_group_set_shares(tg, sd);
+	aggregate_group_set_shares(tg, sd, this_cpu);
 }
 
 static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
@@ -1819,18 +1826,15 @@ static void __init init_aggregate(void)
 		spin_lock_init(&per_cpu(aggregate_lock, i));
 }
 
-static int get_aggregate(struct sched_domain *sd)
+static void get_aggregate(struct sched_domain *sd, int this_cpu)
 {
-	if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
-		return 0;
-
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
-	return 1;
+	spin_lock(&per_cpu(aggregate_lock, this_cpu));
+	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd, this_cpu);
 }
 
-static void put_aggregate(struct sched_domain *sd)
+static void put_aggregate(struct sched_domain *sd, int this_cpu)
 {
-	spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
+	spin_unlock(&per_cpu(aggregate_lock, this_cpu));
 }
 
 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
@@ -1844,12 +1848,11 @@ static inline void init_aggregate(void)
 {
 }
 
-static inline int get_aggregate(struct sched_domain *sd)
+static void get_aggregate(struct sched_domain *sd, int this_cpu)
 {
-	return 0;
 }
 
-static inline void put_aggregate(struct sched_domain *sd)
+static void put_aggregate(struct sched_domain *sd, int this_cpu)
 {
 }
 #endif
@@ -3635,11 +3639,10 @@ static int load_balance(int this_cpu, st
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
-	int unlock_aggregate;
 
 	cpus_setall(*cpus);
 
-	unlock_aggregate = get_aggregate(sd);
+	get_aggregate(sd, this_cpu);
 
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
@@ -3777,8 +3780,7 @@ out_one_pinned:
 	else
 		ld_moved = 0;
 out:
-	if (unlock_aggregate)
-		put_aggregate(sd);
+	put_aggregate(sd, this_cpu);
 	return ld_moved;
 }
 
Index: current/kernel/sched_fair.c
===================================================================
--- current.orig/kernel/sched_fair.c
+++ current/kernel/sched_fair.c
@@ -1388,28 +1388,24 @@ load_balance_fair(struct rq *this_rq, in
 
 	rcu_read_lock();
 	list_for_each_entry(tg, &task_groups, list) {
-		long imbalance;
-		unsigned long this_weight, busiest_weight;
+		unsigned long busiest_weight;
 		long rem_load, max_load, moved_load;
 
+		busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
+
 		/*
 		 * empty group
 		 */
-		if (!aggregate(tg, sd)->task_weight)
+		if (!aggregate(tg, this_cpu)->task_weight || !busiest_weight)
 			continue;
 
-		rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
-		rem_load /= aggregate(tg, sd)->load + 1;
-
-		this_weight = tg->cfs_rq[this_cpu]->task_weight;
-		busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
-
-		imbalance = (busiest_weight - this_weight) / 2;
+		rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight;
+		rem_load /= aggregate(tg, this_cpu)->load + 1;
 
-		if (imbalance < 0)
-			imbalance = busiest_weight;
+		if (!rem_load)
+			continue;
 
-		max_load = max(rem_load, imbalance);
+		max_load = rem_load;
 		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
 				max_load, sd, idle, all_pinned, this_best_prio,
 				tg->cfs_rq[busiest_cpu]);
@@ -1419,8 +1415,8 @@ load_balance_fair(struct rq *this_rq, in
 
 		move_group_shares(tg, sd, busiest_cpu, this_cpu);
 
-		moved_load *= aggregate(tg, sd)->load;
-		moved_load /= aggregate(tg, sd)->rq_weight + 1;
+		moved_load *= aggregate(tg, this_cpu)->load;
+		moved_load /= aggregate(tg, this_cpu)->rq_weight + 1;
 
 		rem_load_move -= moved_load;
 		if (rem_load_move < 0)




-- 
Regards,
vatsa
