Message-Id: <1212838682.5571.6.camel@marge.simson.net>
Date: Sat, 07 Jun 2008 13:38:02 +0200
From: Mike Galbraith <efault@....de>
To: Greg Smith <gsmith@...gsmith.com>
Cc: Ingo Molnar <mingo@...e.hu>, Peter Zijlstra <peterz@...radead.org>,
Dhaval Giani <dhaval@...ux.vnet.ibm.com>,
lkml <linux-kernel@...r.kernel.org>,
Srivatsa Vaddagiri <vatsa@...ux.vnet.ibm.com>
Subject: Re: [patch] Re: PostgreSQL pgbench performance regression in
2.6.23+
On Fri, 2008-06-06 at 08:13 +0200, Mike Galbraith wrote:
> On Fri, 2008-06-06 at 01:03 -0400, Greg Smith wrote:
>
> > I think I might not be testing exactly the same thing you did, though,
> > because the pattern doesn't match. I think that my Q6600 system runs a
> > little bit faster than yours, which is the case for small numbers of
> > clients here. But once we get above 8 clients your setup is way faster,
> > with the difference at 15 clients being the largest. Were you perhaps
> > using batch mode when you generated these results?
>
> No, those were with stock settings.
>
> > Regardless, clearly your patch reduces the regression with the default
> > parameters to a mild one instead of the gigantic one we started with.
>
> Unfortunately, after the recent reverts, we're right back to huge :-/
>
> I'm trying to come up with a dirt simple solution that doesn't harm
> other load types.
The below doesn't hurt my volanomark numbers of the day, helps pgbench
considerably, and improves the higher client end of mysql+oltp a wee
bit. It may hurt the low end slightly, but the low end is always
pretty unstable, so it's hard to tell from only three runs.
pgbench (transactions/sec, two runs per kernel)
clients      2.6.26-rc5                    2.6.26-rc5+
1 10213.768037 10237.990274 10165.511814 10183.705908
2 15885.949053 15519.005195 14994.697875 15204.900479
3 15663.233356 16043.733087 16554.371722 17279.376443
4 14193.807355 15799.792612 18447.345925 18088.861169
5 17239.456219 17326.938538 20119.250823 18537.351094
6 15293.624093 14272.208159 21439.841579 22634.887824
8 12483.727461 13486.991527 25579.379337 25908.373483
10 11919.023584 12058.503518 23876.035623 22403.867804
15 10128.724654 11253.959398 23276.797649 23595.597093
20 9645.056147 9980.465235 23603.315133 23256.506240
30 9288.747962 8801.059613 23633.448266 23229.286697
40 8494.705123 8323.107702 22925.552706 23081.526954
50 8357.781935 8239.867147 19102.481374 19558.624434
volanomark
2.6.26-rc5
test-1.log:Average throughput = 101768 messages per second
test-2.log:Average throughput = 99124 messages per second
test-3.log:Average throughput = 99821 messages per second
test-1.log:Average throughput = 101362 messages per second
test-2.log:Average throughput = 98891 messages per second
test-3.log:Average throughput = 99164 messages per second
2.6.26-rc5+
test-1.log:Average throughput = 103275 messages per second
test-2.log:Average throughput = 100034 messages per second
test-3.log:Average throughput = 99434 messages per second
test-1.log:Average throughput = 100460 messages per second
test-2.log:Average throughput = 100188 messages per second
test-3.log:Average throughput = 99617 messages per second
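To spell the idea out before the diff: below is a minimal stand-alone
sketch of the heuristic (mock structs and constants, not the kernel
code; the real patch additionally compares load weights, only tags the
entity that is actually on the CPU, and gates the wake_affine() test
on sync wakeups). pgbench is the poster child for the 1:N pattern: one
client process drives N server backends, and each backend it wakes
turns around and preempts it.

/*
 * Toy model of the 1:N wakeup preemption avoidance below.
 */
#include <stdio.h>

typedef unsigned long long u64;

#define SCHED_MIGRATION_COST	500000ULL	/* ns, the usual default */

struct entity {
	u64 sum_exec_runtime;		/* total CPU time consumed */
	u64 prev_sum_exec_runtime;	/* snapshot taken when scheduled in */
	struct entity *last_preempter;	/* who wakeup-preempted us last */
};

/* runtime accumulated since the entity last got the CPU */
static u64 slice_exec(struct entity *se)
{
	return se->sum_exec_runtime - se->prev_sum_exec_runtime;
}

/*
 * check_preempt side: a newly woken pse wants to preempt the running
 * se.  If se was already preempted by a *different* wakee and hasn't
 * run for SCHED_MIGRATION_COST since, it is being ping-ponged 1:N,
 * so deny the preemption and let it make some progress.
 */
static int should_preempt(struct entity *se, struct entity *pse)
{
	int preempt = 1;

	if (se->last_preempter && se->last_preempter != pse &&
	    slice_exec(se) < SCHED_MIGRATION_COST)
		preempt = 0;

	se->last_preempter = pse;	/* remember for next time */
	return preempt;
}

/*
 * wake_affine side: under the same condition, refuse the affine
 * (same CPU) wakeup so the preemption sources get spread to other
 * CPUs instead of piling onto this one.
 */
static int affine_wakeup_ok(struct entity *se, struct entity *pse)
{
	if (se->last_preempter && se->last_preempter != pse &&
	    slice_exec(se) < SCHED_MIGRATION_COST)
		return 0;
	return 1;
}

int main(void)
{
	struct entity server = { 0 }, c1 = { 0 }, c2 = { 0 };

	/* first wakeup: no history yet, so preemption goes through */
	printf("c1 preempts: %d\n", should_preempt(&server, &c1));

	/* server gets only 50us in before a different wakee arrives */
	server.sum_exec_runtime += 50000;
	printf("c2 affine:   %d\n", affine_wakeup_ok(&server, &c2));
	printf("c2 preempts: %d\n", should_preempt(&server, &c2));

	return 0;
}

The second and third printf should report 0: the second, different
wakee neither gets an affine wakeup nor preempts, because the server
has run for less than sched_migration_cost since it last got the CPU.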
Index: linux-2.6.26.git/kernel/sched_fair.c
===================================================================
--- linux-2.6.26.git.orig/kernel/sched_fair.c
+++ linux-2.6.26.git/kernel/sched_fair.c
@@ -664,6 +664,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
update_stats_dequeue(cfs_rq, se);
if (sleep) {
+ se->last_preempter = NULL;
update_avg_stats(cfs_rq, se);
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
@@ -692,8 +693,10 @@ check_preempt_tick(struct cfs_rq *cfs_rq
ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
- if (delta_exec > ideal_runtime)
+ if (delta_exec > ideal_runtime) {
+ curr->last_preempter = NULL;
resched_task(rq_of(cfs_rq)->curr);
+ }
}
static void
@@ -994,6 +997,7 @@ wake_affine(struct rq *rq, struct sched_
unsigned int imbalance)
{
struct task_struct *curr = this_rq->curr;
+ struct sched_entity *se = &curr->se, *pse = &p->se;
unsigned long tl = this_load;
unsigned long tl_per_task;
int balanced;
@@ -1002,14 +1006,26 @@ wake_affine(struct rq *rq, struct sched_
return 0;
/*
+ * If the current task is being wakeup preempted by multiple tasks
+ * that it awakened, such that it can't get significant work done
+ * between preemptions, try to spread these preemption sources.
+ */
+ if (sync && se->last_preempter && se->last_preempter != pse) {
+ u64 se_last_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+
+ if (se_last_exec < sysctl_sched_migration_cost)
+ return 0;
+ }
+
+ /*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
* of the current CPU:
*/
if (sync)
- tl -= current->se.load.weight;
+ tl -= se->load.weight;
- balanced = 100*(tl + p->se.load.weight) <= imbalance*load;
+ balanced = 100*(tl + pse->load.weight) <= imbalance*load;
/*
* If the currently running task will sleep within
@@ -1017,8 +1033,8 @@ wake_affine(struct rq *rq, struct sched_
* woken task:
*/
if (sync && balanced && curr->sched_class == &fair_sched_class) {
- if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
- p->se.avg_overlap < sysctl_sched_migration_cost)
+ if (se->avg_overlap < sysctl_sched_migration_cost &&
+ pse->avg_overlap < sysctl_sched_migration_cost)
return 1;
}
@@ -1219,8 +1235,27 @@ static void check_preempt_wakeup(struct
pse = parent_entity(pse);
}
- if (wakeup_preempt_entity(se, pse) == 1)
- resched_task(curr);
+ if (wakeup_preempt_entity(se, pse) == 1) {
+ int preempt = 1;
+
+ /*
+ * If current task is being preempted by multiple wakees,
+ * tag it for 1:N affine wakeup preemption avoidance.
+ */
+ if (se->last_preempter && se->last_preempter != pse &&
+ se->load.weight >= pse->load.weight) {
+ u64 exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+
+ if (exec < sysctl_sched_migration_cost)
+ preempt = 0;
+ }
+
+ if (se == &current->se)
+ se->last_preempter = pse;
+
+ if (preempt)
+ resched_task(curr);
+ }
}
static struct task_struct *pick_next_task_fair(struct rq *rq)
Index: linux-2.6.26.git/include/linux/sched.h
===================================================================
--- linux-2.6.26.git.orig/include/linux/sched.h
+++ linux-2.6.26.git/include/linux/sched.h
@@ -963,6 +963,7 @@ struct sched_entity {
u64 last_wakeup;
u64 avg_overlap;
+ struct sched_entity *last_preempter;
#ifdef CONFIG_SCHEDSTATS
u64 wait_start;
Index: linux-2.6.26.git/kernel/sched.c
===================================================================
--- linux-2.6.26.git.orig/kernel/sched.c
+++ linux-2.6.26.git/kernel/sched.c
@@ -2176,6 +2176,7 @@ static void __sched_fork(struct task_str
p->se.prev_sum_exec_runtime = 0;
p->se.last_wakeup = 0;
p->se.avg_overlap = 0;
+ p->se.last_preempter = NULL;
#ifdef CONFIG_SCHEDSTATS
p->se.wait_start = 0;