Message-ID: <20160616085040.GF30927@twins.programming.kicks-ass.net>
Date:	Thu, 16 Jun 2016 10:50:40 +0200
From:	Peter Zijlstra <peterz@...radead.org>
To:	Yuyang Du <yuyang.du@...el.com>
Cc:	Chris Wilson <chris@...is-wilson.co.uk>,
	Andrey Ryabinin <aryabinin@...tuozzo.com>,
	Linus Torvalds <torvalds@...ux-foundation.org>,
	Mike Galbraith <efault@....de>,
	Thomas Gleixner <tglx@...utronix.de>, bsegall@...gle.com,
	morten.rasmussen@....com, pjt@...gle.com, steve.muckle@...aro.org,
	linux-kernel@...r.kernel.org, kernel@...p.com
Subject: Re: Divide-by-zero in post_init_entity_util_avg

On Thu, Jun 09, 2016 at 03:07:50PM +0200, Peter Zijlstra wrote:
> Which given the lack of serialization, and the code generated from
> update_cfs_rq_load_avg() is entirely possible.
> 
> 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
> 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
> 		sa->load_avg = max_t(long, sa->load_avg - r, 0);
> 		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
> 		removed_load = 1;
> 	}
> 
> turns into:
> 
> ffffffff81087064:       49 8b 85 98 00 00 00    mov    0x98(%r13),%rax
> ffffffff8108706b:       48 85 c0                test   %rax,%rax
> ffffffff8108706e:       74 40                   je     ffffffff810870b0 <update_blocked_averages+0xc0>
> ffffffff81087070:       4c 89 f8                mov    %r15,%rax
> ffffffff81087073:       49 87 85 98 00 00 00    xchg   %rax,0x98(%r13)
> ffffffff8108707a:       49 29 45 70             sub    %rax,0x70(%r13)
> ffffffff8108707e:       4c 89 f9                mov    %r15,%rcx
> ffffffff81087081:       bb 01 00 00 00          mov    $0x1,%ebx
> ffffffff81087086:       49 83 7d 70 00          cmpq   $0x0,0x70(%r13)
> ffffffff8108708b:       49 0f 49 4d 70          cmovns 0x70(%r13),%rcx
> 
> Which you'll note ends up with sa->load_avg -= r in memory at
> ffffffff8108707a.
> 
> Ludicrous code generation if you ask me; I'd have expected something
> like (note, r15 holds 0):
> 
> 	mov	%r15, %rax
> 	xchg	%rax, cfs_rq->removed_load_avg
> 	mov	sa->load_avg, %rcx
> 	sub	%rax, %rcx
> 	cmovs	%r15, %rcx
> 	mov	%rcx, sa->load_avg

So I _should_ have looked at other unserialized users of ->load_avg, but
alas. Luckily nikbor reported a similar divide-by-zero from task_h_load(),
which instantly triggered recollection of this problem.
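
To spell out why the transiently negative value is fatal, here is a
standalone sketch (not kernel code; the unsigned ->load_avg and the "+ 1"
divisor mirror what task_h_load() and post_init_entity_util_avg() do, but
the exact expressions are illustrative):

	#include <stdio.h>

	int main(void)
	{
		/* stands in for sa->load_avg, which is unsigned in struct sched_avg */
		unsigned long load_avg = 5;
		unsigned long r = 6;	/* removed_load_avg just xchg'd out */

		/* the intermediate the old code could spill to memory: (unsigned long)-1 */
		load_avg -= r;

		/* a lockless reader racing with the above does roughly this: */
		unsigned long divisor = load_avg + 1;	/* ULONG_MAX + 1 == 0 */

		if (divisor == 0)
			printf("divide-by-zero\n");
		else
			printf("h_load = %lu\n", 1024UL / divisor);

		return 0;
	}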

Now, we really do not want to go grab rq->lock there, so I did the
below, which actually ends up generating the 'right' code as per the
above:

    3133:       49 87 85 98 00 00 00    xchg   %rax,0x98(%r13)
    313a:       49 8b 4d 70             mov    0x70(%r13),%rcx
    313e:       bb 01 00 00 00          mov    $0x1,%ebx
    3143:       48 29 c1                sub    %rax,%rcx
    3146:       49 0f 48 cf             cmovs  %r15,%rcx
    314a:       48 69 c0 82 45 ff ff    imul   $0xffffffffffff4582,%rax,%rax
    3151:       49 89 4d 70             mov    %rcx,0x70(%r13)

This ensures the negative value never hits memory and allows the
unserialized use.

Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
---
 kernel/sched/fair.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f75930bdd326..3fd3d903e6b6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2878,6 +2878,20 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
 	}
 }
 
+/*
+ * Explicitly do a load-store to ensure the temporary value never hits memory.
+ * This allows lockless observations without ever seeing the negative values.
+ *
+ * Incidentally, this also generates much saner code for x86.
+ */
+#define sub_positive(type, ptr, val) do {			\
+	type tmp = READ_ONCE(*ptr);				\
+	tmp -= (val);						\
+	if (tmp < 0)						\
+		tmp = 0;					\
+	WRITE_ONCE(*ptr, tmp);					\
+} while (0)
+
 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
 static inline int
 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
@@ -2887,15 +2901,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 
 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
-		sa->load_avg = max_t(long, sa->load_avg - r, 0);
-		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+		sub_positive(long, &sa->load_avg, r);
+		sub_positive(s64,  &sa->load_sum, r * LOAD_AVG_MAX);
 		removed_load = 1;
 	}
 
 	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
 		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
-		sa->util_avg = max_t(long, sa->util_avg - r, 0);
-		sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+		sub_positive(long, &sa->util_avg, r);
+		sub_positive(s32,  &sa->util_sum, r * LOAD_AVG_MAX);
 		removed_util = 1;
 	}
 
@@ -2968,10 +2982,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 			  &se->avg, se->on_rq * scale_load_down(se->load.weight),
 			  cfs_rq->curr == se, NULL);
 
-	cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
-	cfs_rq->avg.load_sum = max_t(s64,  cfs_rq->avg.load_sum - se->avg.load_sum, 0);
-	cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
-	cfs_rq->avg.util_sum = max_t(s32,  cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+	sub_positive(long, &cfs_rq->avg.load_avg, se->avg.load_avg);
+	sub_positive(s64,  &cfs_rq->avg.load_sum, se->avg.load_sum);
+	sub_positive(long, &cfs_rq->avg.util_avg, se->avg.util_avg);
+	sub_positive(s32,  &cfs_rq->avg.util_sum, se->avg.util_sum);
 
 	cfs_rq_util_change(cfs_rq);
 }
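
For completeness, the same trick as a self-contained userspace sketch.
READ_ONCE()/WRITE_ONCE() are approximated with plain volatile accesses
here purely to keep it compilable outside the kernel, so treat it as an
illustration of the idea rather than a drop-in:

	#include <stdio.h>

	/* crude stand-ins for the kernel's READ_ONCE()/WRITE_ONCE() */
	#define READ_ONCE(x)		(*(volatile __typeof__(x) *)&(x))
	#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))

	/*
	 * Load into a local, clamp, then store: the possibly negative
	 * intermediate lives only in a local/register, never in *ptr.
	 */
	#define sub_positive(type, ptr, val) do {		\
		type tmp = READ_ONCE(*(ptr));			\
		tmp -= (val);					\
		if (tmp < 0)					\
			tmp = 0;				\
		WRITE_ONCE(*(ptr), tmp);			\
	} while (0)

	int main(void)
	{
		long load_avg = 5;
		long removed  = 6;

		sub_positive(long, &load_avg, removed);

		/* prints 0; -1 was never stored to load_avg */
		printf("load_avg = %ld\n", load_avg);
		return 0;
	}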
