Message-ID: <20160616085040.GF30927@twins.programming.kicks-ass.net>
Date: Thu, 16 Jun 2016 10:50:40 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: Yuyang Du <yuyang.du@...el.com>
Cc: Chris Wilson <chris@...is-wilson.co.uk>,
Andrey Ryabinin <aryabinin@...tuozzo.com>,
Linus Torvalds <torvalds@...ux-foundation.org>,
Mike Galbraith <efault@....de>,
Thomas Gleixner <tglx@...utronix.de>, bsegall@...gle.com,
morten.rasmussen@....com, pjt@...gle.com, steve.muckle@...aro.org,
linux-kernel@...r.kernel.org, kernel@...p.com
Subject: Re: Divide-by-zero in post_init_entity_util_avg

On Thu, Jun 09, 2016 at 03:07:50PM +0200, Peter Zijlstra wrote:
> Which given the lack of serialization, and the code generated from
> update_cfs_rq_load_avg() is entirely possible.
>
> 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
> 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
> 		sa->load_avg = max_t(long, sa->load_avg - r, 0);
> 		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
> 		removed_load = 1;
> 	}
>
> turns into:
>
> ffffffff81087064: 49 8b 85 98 00 00 00 mov 0x98(%r13),%rax
> ffffffff8108706b: 48 85 c0 test %rax,%rax
> ffffffff8108706e: 74 40 je ffffffff810870b0 <update_blocked_averages+0xc0>
> ffffffff81087070: 4c 89 f8 mov %r15,%rax
> ffffffff81087073: 49 87 85 98 00 00 00 xchg %rax,0x98(%r13)
> ffffffff8108707a: 49 29 45 70 sub %rax,0x70(%r13)
> ffffffff8108707e: 4c 89 f9 mov %r15,%rcx
> ffffffff81087081: bb 01 00 00 00 mov $0x1,%ebx
> ffffffff81087086: 49 83 7d 70 00 cmpq $0x0,0x70(%r13)
> ffffffff8108708b: 49 0f 49 4d 70 cmovns 0x70(%r13),%rcx
>
> Which you'll note ends up with sa->load_avg -= r in memory at
> ffffffff8108707a.
>
> Ludicrous code generation if you ask me; I'd have expected something
> like (note, r15 holds 0):
>
> mov %r15, %rax
> xchg %rax, cfs_rq->removed_load_avg
> mov sa->load_avg, %rcx
> sub %rax, %rcx
> cmovs %r15, %rcx
> mov %rcx, sa->load_avg

So I _should_ have looked at other unserialized users of ->load_avg,
but alas. Luckily nikbor reported a similar divide-by-zero in
task_h_load(), which instantly triggered recollection of this here
problem.

Now, we really do not want to go grab rq->lock there, so I did the
below, which actually ends up generating the 'right' code as per the
above:

3133: 49 87 85 98 00 00 00 xchg %rax,0x98(%r13)
313a: 49 8b 4d 70 mov 0x70(%r13),%rcx
313e: bb 01 00 00 00 mov $0x1,%ebx
3143: 48 29 c1 sub %rax,%rcx
3146: 49 0f 48 cf cmovs %r15,%rcx
314a: 48 69 c0 82 45 ff ff imul $0xffffffffffff4582,%rax,%rax
3151: 49 89 4d 70 mov %rcx,0x70(%r13)

This ensures the negative value never hits memory and allows the
unserialized use.
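
As an aside, a rough userspace sketch of the pattern, purely for poking
at the codegen; the volatile-cast READ_ONCE()/WRITE_ONCE() stand-ins and
the numbers are made up for illustration, this is not the kernel code:

	#include <stdio.h>

	/* crude stand-ins for the kernel's READ_ONCE()/WRITE_ONCE() */
	#define READ_ONCE(x)		(*(volatile typeof(x) *)&(x))
	#define WRITE_ONCE(x, val)	(*(volatile typeof(x) *)&(x) = (val))

	/* same shape as the patch below: load once, clamp, store once */
	#define sub_positive(type, ptr, val) do {	\
		type tmp = READ_ONCE(*(ptr));		\
		tmp -= (val);				\
		if (tmp < 0)				\
			tmp = 0;			\
		WRITE_ONCE(*(ptr), tmp);		\
	} while (0)

	static long load_avg = 100;

	int main(void)
	{
		/* the intermediate -150 only ever lives in a register */
		sub_positive(long, &load_avg, 250);
		printf("%ld\n", load_avg);	/* prints 0 */
		return 0;
	}

Building that with gcc -O2 and objdump'ing it should show the subtract
and the clamp done on a register, with a single store at the end, much
like the listing above.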
Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
---
 kernel/sched/fair.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f75930bdd326..3fd3d903e6b6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2878,6 +2878,20 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
 	}
 }
 
+/*
+ * Explicitly do a load-store to ensure the temporary value never hits memory.
+ * This allows lockless observations without ever seeing the negative values.
+ *
+ * Incidentally, this also generates much saner code for x86.
+ */
+#define sub_positive(type, ptr, val) do {			\
+	type tmp = READ_ONCE(*ptr);				\
+	tmp -= (val);						\
+	if (tmp < 0)						\
+		tmp = 0;					\
+	WRITE_ONCE(*ptr, tmp);					\
+} while (0)
+
 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
 static inline int
 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
@@ -2887,15 +2901,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 
 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
-		sa->load_avg = max_t(long, sa->load_avg - r, 0);
-		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+		sub_positive(long, &sa->load_avg, r);
+		sub_positive(s64, &sa->load_sum, r * LOAD_AVG_MAX);
 		removed_load = 1;
 	}
 
 	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
 		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
-		sa->util_avg = max_t(long, sa->util_avg - r, 0);
-		sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+		sub_positive(long, &sa->util_avg, r);
+		sub_positive(s32, &sa->util_sum, r * LOAD_AVG_MAX);
 		removed_util = 1;
 	}
 
@@ -2968,10 +2982,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 			  &se->avg, se->on_rq * scale_load_down(se->load.weight),
 			  cfs_rq->curr == se, NULL);
 
-	cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
-	cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
-	cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
-	cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+	sub_positive(long, &cfs_rq->avg.load_avg, se->avg.load_avg);
+	sub_positive(s64, &cfs_rq->avg.load_sum, se->avg.load_sum);
+	sub_positive(long, &cfs_rq->avg.util_avg, se->avg.util_avg);
+	sub_positive(s32, &cfs_rq->avg.util_sum, se->avg.util_sum);
 
 	cfs_rq_util_change(cfs_rq);
 }
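
For completeness, a stand-alone (and entirely made-up, userspace) demo
of the window this closes; deliberately racy, it only exists to show
that with the old ordering the transient negative value is observable
by an unserialized reader, which is roughly what task_h_load() ran
into (build with gcc -O2 -pthread):

	#include <pthread.h>
	#include <stdio.h>

	#define READ_ONCE(x)		(*(volatile typeof(x) *)&(x))
	#define WRITE_ONCE(x, val)	(*(volatile typeof(x) *)&(x) = (val))

	static long load_avg = 100;

	/* mimics the old codegen: the subtract lands in memory, the clamp follows */
	static void *writer(void *arg)
	{
		for (long i = 0; i < 100000000; i++) {
			WRITE_ONCE(load_avg, 100);
			WRITE_ONCE(load_avg, READ_ONCE(load_avg) - 250);
			if (READ_ONCE(load_avg) < 0)
				WRITE_ONCE(load_avg, 0);
		}
		return NULL;
	}

	int main(void)
	{
		pthread_t t;
		long hits = 0;

		pthread_create(&t, NULL, writer, NULL);
		for (long i = 0; i < 100000000; i++)
			if (READ_ONCE(load_avg) < 0)
				hits++;	/* what an unserialized reader can see */
		pthread_join(t, NULL);

		printf("observed the negative intermediate %ld times\n", hits);
		return 0;
	}

With the sub_positive() ordering, the store of the unclamped value goes
away and the reader loop should never count a hit.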