[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20070307183256.pkas62fucy2vugqg@m.safari.iki.fi>
Date: Wed, 7 Mar 2007 20:32:56 +0200
From: Sami Farin <7atbggg02@...akemail.com>
To: linux-kernel@...r.kernel.org, netdev@...r.kernel.org
Subject: Re: [RFC] div64_64 support
On Wed, Mar 07, 2007 at 11:11:49 -0500, Chuck Ebbert wrote:
> Sami Farin wrote:
> > On Tue, Mar 06, 2007 at 23:53:49 +0200, Sami Farin wrote:
> > ...
> >> And I found a bug in gcc-4.1.2: it gave 0 for ncubic results
> >> when doing the 1000-loop test... gcc-4.0.3 works.
> >
> > Found it.
> >
> > --- cbrt-test.c~ 2007-03-07 00:20:54.735248105 +0200
> > +++ cbrt-test.c 2007-03-07 00:21:03.964864343 +0200
> > @@ -209,7 +209,7 @@
> >
> > __asm__("bsrl %1,%0\n\t"
> > "cmovzl %2,%0"
> > - : "=&r" (r) : "rm" (x), "rm" (-1));
> > + : "=&r" (r) : "rm" (x), "rm" (-1) : "memory");
> > return r+1;
> > }
> >
> > Now, Linux 2.6 does not have a "memory" clobber in fls(); maybe that
> > causes some of the gcc funnies some people are seeing.
>
> Can you post the difference in the generated code with that change?
Fun... it looks like when "memory" is not used, gcc does not even bother
calling ncubic() 666 times. So it gets better timings ( 42/666=0 ) =)
--- cbrt-test-no_memory.s 2007-03-07 20:22:27.838466385 +0200
+++ cbrt-test-using_memory.s 2007-03-07 20:22:38.237013197 +0200
...
main:
leal 4(%esp), %ecx
andl $-16, %esp
pushl -4(%ecx)
pushl %ebp
pushl %edi
pushl %esi
pushl %ebx
pushl %ecx
- subl $136, %esp
+ subl $152, %esp
movl $.LC0, (%esp)
call puts
xorl %edx, %edx
movl $27, %eax
call ncubic
cmpl $3, %eax
- je .L83
+ je .L87
movl $.LC1, (%esp)
call puts
-.L83:
- xorl %eax, %eax
- xorl %edi, %edi
- movl %eax, 88(%esp)
+.L87:
xorl %eax, %eax
- xorl %esi, %esi
+ xorl %ebp, %ebp
movl %eax, 92(%esp)
xorl %eax, %eax
- xorl %ebp, %ebp
+ xorl %edi, %edi
movl %eax, 96(%esp)
xorl %eax, %eax
+ xorl %esi, %esi
movl %eax, 100(%esp)
xorl %eax, %eax
movl %eax, 104(%esp)
xorl %eax, %eax
movl %eax, 108(%esp)
- movl %edi, 112(%esp)
- movl %esi, 116(%esp)
- .p2align 4,,15
-.L84:
+ xorl %eax, %eax
+ movl %eax, 112(%esp)
+ movl %ebp, 116(%esp)
+ movl %edi, 120(%esp)
+ movl %esi, 124(%esp)
+.L88:
#APP
movl $0, %eax
cpuid
rdtsc
#NO_APP
movl %eax, 56(%esp)
movl %edx, 60(%esp)
#APP
movl $0, %eax
cpuid
rdtsc
#NO_APP
movl %eax, %esi
movl %edx, %edi
subl 56(%esp), %esi
sbbl 60(%esp), %edi
cmpl $0, %edi
ja .L66
cmpl $999, %esi
- jbe .L84
+ jbe .L88
.L66:
+ movl 92(%esp), %edx
+ leal (%edx,%edx,2), %eax
+ movl cases+4(,%eax,4), %edi
+ movl cases(,%eax,4), %esi
+ movl %edi, %edx
+ movl %esi, %eax
+ call ncubic
#APP
movl $0, %eax
cpuid
rdtsc
#NO_APP
- movl %eax, %esi
- movl %edx, %edi
+ movl $666, %ebx
+ movl %eax, 128(%esp)
+ movl %edx, 132(%esp)
+ .p2align 4,,15
+.L67:
+ movl %esi, %eax
+ movl %edi, %edx
+ call ncubic
+ decl %ebx
+ movl %eax, %ebp
+ jne .L67
#APP
movl $0, %eax
cpuid
rdtsc
#NO_APP
- subl %esi, %eax
+ subl 128(%esp), %eax
movl $666, %ebx
- sbbl %edi, %edx
- xorl %ecx, %ecx
movl %ebx, 8(%esp)
+ sbbl 132(%esp), %edx
+ xorl %ecx, %ecx
movl %ecx, 12(%esp)
movl %eax, (%esp)
movl %edx, 4(%esp)
call __udivdi3
- addl %eax, 104(%esp)
+ addl %eax, 112(%esp)
movl %edx, %ecx
movl %eax, %ebx
movl %edx, %esi
- adcl %edx, 108(%esp)
+ adcl %edx, 116(%esp)
imull %eax, %ecx
mull %ebx
addl %ecx, %ecx
movl %eax, 56(%esp)
addl %ecx, %edx
movl 56(%esp), %eax
- addl %eax, 112(%esp)
+ addl %eax, 120(%esp)
movl %edx, 60(%esp)
movl 60(%esp), %edx
- adcl %edx, 116(%esp)
- cmpl %esi, 92(%esp)
- ja .L67
- jb .L68
- cmpl %ebx, 88(%esp)
- jae .L67
-.L68:
- movl %ebx, 88(%esp)
- movl %esi, 92(%esp)
-.L67:
- leal (%ebp,%ebp,2), %ebx
- sall $2, %ebx
- movl cases+4(%ebx), %edx
- movl cases(%ebx), %eax
- call ncubic
- movl cases+8(%ebx), %edx
- subl %eax, %edx
- movl %edx, %eax
- sarl $31, %eax
- xorl %eax, %edx
- subl %eax, %edx
- movl %edx, %ecx
- sarl $31, %ecx
- addl %edx, 96(%esp)
- adcl %ecx, 100(%esp)
- incl %ebp
- cmpl $183, %ebp
- jbe .L84
- movl 108(%esp), %eax
- fildll 104(%esp)
- testl %eax, %eax
- js .L85
+ adcl %edx, 124(%esp)
+ cmpl %esi, 100(%esp)
+ ja .L69
+ jb .L70
+ cmpl %ebx, 96(%esp)
+ jae .L69
.L70:
- fstpl 120(%esp)
+ movl %ebx, 96(%esp)
+ movl %esi, 100(%esp)
+.L69:
+ movl 92(%esp), %edx
+ leal (%edx,%edx,2), %eax
+ movl cases+8(,%eax,4), %eax
+ subl %ebp, %eax
+ movl %eax, %ecx
+ sarl $31, %ecx
+ xorl %ecx, %eax
+ subl %ecx, %eax
+ cltd
+ addl %eax, 104(%esp)
+ adcl %edx, 108(%esp)
+ incl 92(%esp)
+ cmpl $183, 92(%esp)
+ jbe .L88
movl 116(%esp), %eax
- fldl 120(%esp)
+ fildll 112(%esp)
+ testl %eax, %eax
+ js .L89
+.L72:
+ fstpl 136(%esp)
+ movl 124(%esp), %eax
+ fldl 136(%esp)
fdivl .LC7
testl %eax, %eax
flds .LC4
fdivr %st, %st(1)
- fildll 112(%esp)
- js .L86
-.L71:
- fstpl 120(%esp)
- fldl 120(%esp)
+ fildll 120(%esp)
+ js .L90
+.L73:
+ fstpl 136(%esp)
+ fldl 136(%esp)
fdivl .LC7
fdivp %st, %st(1)
fld %st(1)
fmul %st(2), %st
fsubrp %st, %st(1)
fld %st(0)
fsqrt
fucomi %st(0), %st
- jp .L88
- je .L89
-.L88:
+ jp .L92
+ je .L93
+.L92:
fstp %st(0)
fstpl (%esp)
fstpl 64(%esp)
call sqrt
fldl 64(%esp)
fxch %st(1)
-.L72:
- movl 96(%esp), %eax
- movl 100(%esp), %edx
- fildll 88(%esp)
+.L74:
+ movl 104(%esp), %eax
+ movl 108(%esp), %edx
+ fildll 96(%esp)
movl %eax, 40(%esp)
- movl 92(%esp), %eax
+ movl 100(%esp), %eax
movl %edx, 44(%esp)
testl %eax, %eax
- js .L87
-.L73:
- fstpl 120(%esp)
- movl 104(%esp), %eax
+ js .L91
+.L75:
+ fstpl 136(%esp)
+ movl 112(%esp), %eax
movl $184, %ebp
- fldl 120(%esp)
+ fldl 136(%esp)
xorl %edi, %edi
movl $.LC5, %esi
fdivl .LC7
- movl 108(%esp), %edx
+ movl 116(%esp), %edx
movl %ebp, 8(%esp)
movl %edi, 12(%esp)
movl %eax, (%esp)
movl %edx, 4(%esp)
fstpl 32(%esp)
fstpl 24(%esp)
fstpl 16(%esp)
call __udivdi3
movl %esi, 4(%esp)
movl $.LC6, (%esp)
movl %eax, 8(%esp)
movl %edx, 12(%esp)
call printf
- addl $136, %esp
+ addl $152, %esp
xorl %eax, %eax
popl %ecx
popl %ebx
popl %esi
popl %edi
popl %ebp
leal -4(%ecx), %esp
ret
-.L89:
+.L93:
fstp %st(1)
+ jmp .L74
+.L89:
+ fadds .LC2
jmp .L72
-.L85:
+.L91:
fadds .LC2
- jmp .L70
-.L87:
+ jmp .L75
+.L90:
fadds .LC2
jmp .L73
-.L86:
- fadds .LC2
- jmp .L71
.size main, .-main
.section .rodata
.align 32
.type cases, @object
.size cases, 2208
cases:
...
--
View attachment "cbrt-test.c" of type "text/plain" (11464 bytes)
Powered by blists - more mailing lists