lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20070307183256.pkas62fucy2vugqg@m.safari.iki.fi>
Date:	Wed, 7 Mar 2007 20:32:56 +0200
From:	Sami Farin <7atbggg02@...akemail.com>
To:	linux-kernel@...r.kernel.org, netdev@...r.kernel.org
Subject: Re: [RFC] div64_64 support

On Wed, Mar 07, 2007 at 11:11:49 -0500, Chuck Ebbert wrote:
> Sami Farin wrote:
> > On Tue, Mar 06, 2007 at 23:53:49 +0200, Sami Farin wrote:
> > ...
> >> And I found a bug in gcc-4.1.2: it gave 0 for the ncubic results
> >> when doing the 1000-loop test... gcc-4.0.3 works.
> > 
> > Found it.
> > 
> > --- cbrt-test.c~	2007-03-07 00:20:54.735248105 +0200
> > +++ cbrt-test.c	2007-03-07 00:21:03.964864343 +0200
> > @@ -209,7 +209,7 @@
> >  
> >  	__asm__("bsrl %1,%0\n\t"
> >  		"cmovzl %2,%0"
> > -		: "=&r" (r) : "rm" (x), "rm" (-1));
> > +		: "=&r" (r) : "rm" (x), "rm" (-1) : "memory");
> >  	return r+1;
> >  }
> >  
> > Linux 2.6 currently does not have "memory" in fls; maybe that causes
> > some of the gcc funnies some people are seeing.
> 
> Can you post the difference in the generated code with that change?

Fun... it looks like when "memory" is not used, gcc does not even bother
calling ncubic() 666 times.  So it gets better timings ( 42/666=0 ) =)

--- cbrt-test-no_memory.s	2007-03-07 20:22:27.838466385 +0200
+++ cbrt-test-using_memory.s	2007-03-07 20:22:38.237013197 +0200
...
 main:
 	leal	4(%esp), %ecx
 	andl	$-16, %esp
 	pushl	-4(%ecx)
 	pushl	%ebp
 	pushl	%edi
 	pushl	%esi
 	pushl	%ebx
 	pushl	%ecx
-	subl	$136, %esp
+	subl	$152, %esp
 	movl	$.LC0, (%esp)
 	call	puts
 	xorl	%edx, %edx
 	movl	$27, %eax
 	call	ncubic
 	cmpl	$3, %eax
-	je	.L83
+	je	.L87
 	movl	$.LC1, (%esp)
 	call	puts
-.L83:
-	xorl	%eax, %eax
-	xorl	%edi, %edi
-	movl	%eax, 88(%esp)
+.L87:
 	xorl	%eax, %eax
-	xorl	%esi, %esi
+	xorl	%ebp, %ebp
 	movl	%eax, 92(%esp)
 	xorl	%eax, %eax
-	xorl	%ebp, %ebp
+	xorl	%edi, %edi
 	movl	%eax, 96(%esp)
 	xorl	%eax, %eax
+	xorl	%esi, %esi
 	movl	%eax, 100(%esp)
 	xorl	%eax, %eax
 	movl	%eax, 104(%esp)
 	xorl	%eax, %eax
 	movl	%eax, 108(%esp)
-	movl	%edi, 112(%esp)
-	movl	%esi, 116(%esp)
-	.p2align 4,,15
-.L84:
+	xorl	%eax, %eax
+	movl	%eax, 112(%esp)
+	movl	%ebp, 116(%esp)
+	movl	%edi, 120(%esp)
+	movl	%esi, 124(%esp)
+.L88:
 #APP
 	movl $0, %eax
 	cpuid
 	rdtsc
 
 #NO_APP
 	movl	%eax, 56(%esp)
 	movl	%edx, 60(%esp)
 #APP
 	movl $0, %eax
 	cpuid
 	rdtsc
 
 #NO_APP
 	movl	%eax, %esi
 	movl	%edx, %edi
 	subl	56(%esp), %esi
 	sbbl	60(%esp), %edi
 	cmpl	$0, %edi
 	ja	.L66
 	cmpl	$999, %esi
-	jbe	.L84
+	jbe	.L88
 .L66:
+	movl	92(%esp), %edx
+	leal	(%edx,%edx,2), %eax
+	movl	cases+4(,%eax,4), %edi
+	movl	cases(,%eax,4), %esi
+	movl	%edi, %edx
+	movl	%esi, %eax
+	call	ncubic
 #APP
 	movl $0, %eax
 	cpuid
 	rdtsc
 
 #NO_APP
-	movl	%eax, %esi
-	movl	%edx, %edi
+	movl	$666, %ebx
+	movl	%eax, 128(%esp)
+	movl	%edx, 132(%esp)
+	.p2align 4,,15
+.L67:
+	movl	%esi, %eax
+	movl	%edi, %edx
+	call	ncubic
+	decl	%ebx
+	movl	%eax, %ebp
+	jne	.L67
 #APP
 	movl $0, %eax
 	cpuid
 	rdtsc
 
 #NO_APP
-	subl	%esi, %eax
+	subl	128(%esp), %eax
 	movl	$666, %ebx
-	sbbl	%edi, %edx
-	xorl	%ecx, %ecx
 	movl	%ebx, 8(%esp)
+	sbbl	132(%esp), %edx
+	xorl	%ecx, %ecx
 	movl	%ecx, 12(%esp)
 	movl	%eax, (%esp)
 	movl	%edx, 4(%esp)
 	call	__udivdi3
-	addl	%eax, 104(%esp)
+	addl	%eax, 112(%esp)
 	movl	%edx, %ecx
 	movl	%eax, %ebx
 	movl	%edx, %esi
-	adcl	%edx, 108(%esp)
+	adcl	%edx, 116(%esp)
 	imull	%eax, %ecx
 	mull	%ebx
 	addl	%ecx, %ecx
 	movl	%eax, 56(%esp)
 	addl	%ecx, %edx
 	movl	56(%esp), %eax
-	addl	%eax, 112(%esp)
+	addl	%eax, 120(%esp)
 	movl	%edx, 60(%esp)
 	movl	60(%esp), %edx
-	adcl	%edx, 116(%esp)
-	cmpl	%esi, 92(%esp)
-	ja	.L67
-	jb	.L68
-	cmpl	%ebx, 88(%esp)
-	jae	.L67
-.L68:
-	movl	%ebx, 88(%esp)
-	movl	%esi, 92(%esp)
-.L67:
-	leal	(%ebp,%ebp,2), %ebx
-	sall	$2, %ebx
-	movl	cases+4(%ebx), %edx
-	movl	cases(%ebx), %eax
-	call	ncubic
-	movl	cases+8(%ebx), %edx
-	subl	%eax, %edx
-	movl	%edx, %eax
-	sarl	$31, %eax
-	xorl	%eax, %edx
-	subl	%eax, %edx
-	movl	%edx, %ecx
-	sarl	$31, %ecx
-	addl	%edx, 96(%esp)
-	adcl	%ecx, 100(%esp)
-	incl	%ebp
-	cmpl	$183, %ebp
-	jbe	.L84
-	movl	108(%esp), %eax
-	fildll	104(%esp)
-	testl	%eax, %eax
-	js	.L85
+	adcl	%edx, 124(%esp)
+	cmpl	%esi, 100(%esp)
+	ja	.L69
+	jb	.L70
+	cmpl	%ebx, 96(%esp)
+	jae	.L69
 .L70:
-	fstpl	120(%esp)
+	movl	%ebx, 96(%esp)
+	movl	%esi, 100(%esp)
+.L69:
+	movl	92(%esp), %edx
+	leal	(%edx,%edx,2), %eax
+	movl	cases+8(,%eax,4), %eax
+	subl	%ebp, %eax
+	movl	%eax, %ecx
+	sarl	$31, %ecx
+	xorl	%ecx, %eax
+	subl	%ecx, %eax
+	cltd
+	addl	%eax, 104(%esp)
+	adcl	%edx, 108(%esp)
+	incl	92(%esp)
+	cmpl	$183, 92(%esp)
+	jbe	.L88
 	movl	116(%esp), %eax
-	fldl	120(%esp)
+	fildll	112(%esp)
+	testl	%eax, %eax
+	js	.L89
+.L72:
+	fstpl	136(%esp)
+	movl	124(%esp), %eax
+	fldl	136(%esp)
 	fdivl	.LC7
 	testl	%eax, %eax
 	flds	.LC4
 	fdivr	%st, %st(1)
-	fildll	112(%esp)
-	js	.L86
-.L71:
-	fstpl	120(%esp)
-	fldl	120(%esp)
+	fildll	120(%esp)
+	js	.L90
+.L73:
+	fstpl	136(%esp)
+	fldl	136(%esp)
 	fdivl	.LC7
 	fdivp	%st, %st(1)
 	fld	%st(1)
 	fmul	%st(2), %st
 	fsubrp	%st, %st(1)
 	fld	%st(0)
 	fsqrt
 	fucomi	%st(0), %st
-	jp	.L88
-	je	.L89
-.L88:
+	jp	.L92
+	je	.L93
+.L92:
 	fstp	%st(0)
 	fstpl	(%esp)
 	fstpl	64(%esp)
 	call	sqrt
 	fldl	64(%esp)
 	fxch	%st(1)
-.L72:
-	movl	96(%esp), %eax
-	movl	100(%esp), %edx
-	fildll	88(%esp)
+.L74:
+	movl	104(%esp), %eax
+	movl	108(%esp), %edx
+	fildll	96(%esp)
 	movl	%eax, 40(%esp)
-	movl	92(%esp), %eax
+	movl	100(%esp), %eax
 	movl	%edx, 44(%esp)
 	testl	%eax, %eax
-	js	.L87
-.L73:
-	fstpl	120(%esp)
-	movl	104(%esp), %eax
+	js	.L91
+.L75:
+	fstpl	136(%esp)
+	movl	112(%esp), %eax
 	movl	$184, %ebp
-	fldl	120(%esp)
+	fldl	136(%esp)
 	xorl	%edi, %edi
 	movl	$.LC5, %esi
 	fdivl	.LC7
-	movl	108(%esp), %edx
+	movl	116(%esp), %edx
 	movl	%ebp, 8(%esp)
 	movl	%edi, 12(%esp)
 	movl	%eax, (%esp)
 	movl	%edx, 4(%esp)
 	fstpl	32(%esp)
 	fstpl	24(%esp)
 	fstpl	16(%esp)
 	call	__udivdi3
 	movl	%esi, 4(%esp)
 	movl	$.LC6, (%esp)
 	movl	%eax, 8(%esp)
 	movl	%edx, 12(%esp)
 	call	printf
-	addl	$136, %esp
+	addl	$152, %esp
 	xorl	%eax, %eax
 	popl	%ecx
 	popl	%ebx
 	popl	%esi
 	popl	%edi
 	popl	%ebp
 	leal	-4(%ecx), %esp
 	ret
-.L89:
+.L93:
 	fstp	%st(1)
+	jmp	.L74
+.L89:
+	fadds	.LC2
 	jmp	.L72
-.L85:
+.L91:
 	fadds	.LC2
-	jmp	.L70
-.L87:
+	jmp	.L75
+.L90:
 	fadds	.LC2
 	jmp	.L73
-.L86:
-	fadds	.LC2
-	jmp	.L71
 	.size	main, .-main
 	.section	.rodata
 	.align 32
 	.type	cases, @object
 	.size	cases, 2208
 cases:
...


-- 

View attachment "cbrt-test.c" of type "text/plain" (11464 bytes)

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ