[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <49F95E9B.5020005@cosmosbay.com>
Date: Thu, 30 Apr 2009 10:17:31 +0200
From: Eric Dumazet <dada1@...mosbay.com>
To: Andrew Gallatin <gallatin@...i.com>
CC: Herbert Xu <herbert@...dor.apana.org.au>,
David Miller <davem@...emloft.net>, brice@...i.com,
sgruszka@...hat.com, netdev@...r.kernel.org
Subject: Re: [PATCH] myr10ge: again fix lro_gen_skb() alignment
Andrew Gallatin a écrit :
> Eric Dumazet wrote:
>
>>
>> Sure, probably more cache misses or something...
>
> Yes, that's what I thought. The code is much more complete,
> and spread out than LRO, and seems to open itself to cache
> misses.
>
>> You could try a longer oprofile session (with at least one million
> samples)
>> and :
>>
>> opannotate -a vmlinux >/tmp/FILE
>>
>> And select 3 or 4 suspect functions : inet_gro_receive()
> tcp_gro_receive(),
>> skb_gro_receive(), skb_gro_header()
>
> Here is the opreport -l output from this machine for GRO for a 25 minute
> profiling run:
>
>
> samples % image name app name symbol name
> 3742674 32.2793 vmlinux vmlinux copy_user_generic_string
> 890179 7.6775 myri10ge.ko myri10ge myri10ge_poll
> 547572 4.7226 vmlinux vmlinux inet_gro_receive
> 477479 4.1181 vmlinux vmlinux skb_gro_receive
> 406562 3.5065 vmlinux vmlinux free_hot_cold_page
> 396796 3.4222 vmlinux vmlinux tcp_gro_receive
> 332364 2.8665 vmlinux vmlinux __rmqueue_smallest
> 319455 2.7552 vmlinux vmlinux skb_gro_header
> 269040 2.3204 vmlinux vmlinux dev_gro_receive
> 252885 2.1810 vmlinux vmlinux free_pages_bulk
> 247832 2.1375 vmlinux vmlinux
> get_pageblock_flags_group
> 211592 1.8249 myri10ge.ko myri10ge myri10ge_alloc_rx_pages
> 208867 1.8014 vmlinux vmlinux __list_add
> 201491 1.7378 vmlinux vmlinux tcp4_gro_receive
> 187591 1.6179 vmlinux vmlinux __napi_gro_receive
> 170156 1.4675 vmlinux vmlinux get_page_from_freelist
> 116321 1.0032 vmlinux vmlinux
> list_del
> 107994 0.9314 vmlinux vmlinux kfree
> 106434 0.9180 vmlinux vmlinux skb_copy_datagram_iovec
> 100675 0.8683 vmlinux vmlinux
> put_page
>
> And here is the opannotate -a output for a few GRO functions. BTW,
> did you mean -s
> rather than -a? I'd naively think source might be more helpful. But
> here is
> what you asked for:
>
> ffffffff80479f20 <inet_gro_receive>: /* inet_gro_receive total: 547572
> 5.2554 */
> 12187 0.1170 :ffffffff80479f20: push %r13
> 2611 0.0251 :ffffffff80479f22: mov %rdi,%r13
> :ffffffff80479f25: push %r12
> :ffffffff80479f27: push %rbp
> 4031 0.0387 :ffffffff80479f28: push %rbx
> :ffffffff80479f29: mov %rsi,%rbx
> :ffffffff80479f2c: mov $0x14,%esi
> 6303 0.0605 :ffffffff80479f31: mov %rbx,%rdi
> :ffffffff80479f34: sub $0x8,%rsp
> :ffffffff80479f38: callq ffffffff804357a1
> <skb_gro_header>
> :ffffffff80479f3d: test %rax,%rax
> 2494 0.0239 :ffffffff80479f40: mov %rax,%r8
> :ffffffff80479f43: je ffffffff8047a0a4
> <inet_gro_receive+0x184>
> :ffffffff80479f49: movzbl 0x9(%rax),%eax
> 2541 0.0244 :ffffffff80479f4d: mov
> 0xffffffff80d06280(,%rax,8),%r11
> 33 3.2e-04 :ffffffff80479f55: test %r11,%r11
> 5 4.8e-05 :ffffffff80479f58: je ffffffff8047a0a4
> <inet_gro_receive+0x184>
> 11016 0.1057 :ffffffff80479f5e: cmpq $0x0,0x20(%r11)
> 292 0.0028 :ffffffff80479f63: je ffffffff8047a0a4
> <inet_gro_receive+0x184>
> 1 9.6e-06 :ffffffff80479f69: cmpb $0x45,(%r8)
> 4297 0.0412 :ffffffff80479f6d: jne ffffffff8047a0a4
> <inet_gro_receive+0x184>
> 6086 0.0584 :ffffffff80479f73: mov $0x5,%eax
> :ffffffff80479f78: mov %r8,%rcx
> 18706 0.1795 :ffffffff80479f7b: mov (%rcx),%edx
> 341 0.0033 :ffffffff80479f7d: sub $0x4,%eax
> :ffffffff80479f80: jbe ffffffff80479fa6
> <inet_gro_receive+0x86>
> 4609 0.0442 :ffffffff80479f82: add 0x4(%rcx),%edx
> 398 0.0038 :ffffffff80479f85: adc 0x8(%rcx),%edx
> :ffffffff80479f88: adc 0xc(%rcx),%edx
> 4310 0.0414 :ffffffff80479f8b: adc 0x10(%rcx),%edx
> 790 0.0076 :ffffffff80479f8e: lea 0x4(%rcx),%rcx
> :ffffffff80479f92: dec %eax
> 9097 0.0873 :ffffffff80479f94: jne ffffffff80479f8b
> <inet_gro_receive+0x6b>
> 541 0.0052 :ffffffff80479f96: adc $0x0,%edx
> :ffffffff80479f99: mov %edx,%eax
> 1919 0.0184 :ffffffff80479f9b: shr $0x10,%edx
> 535 0.0051 :ffffffff80479f9e: add %ax,%dx
> :ffffffff80479fa1: adc $0x0,%edx
> 3633 0.0349 :ffffffff80479fa4: not %edx
> 683 0.0066 :ffffffff80479fa6: test %dx,%dx
> 1 9.6e-06 :ffffffff80479fa9: jne ffffffff8047a0a4
> <inet_gro_receive+0x184>
> 4725 0.0453 :ffffffff80479faf: movzwl 0x2(%r8),%eax
> 9728 0.0934 :ffffffff80479fb4: mov 0x68(%rbx),%edx
> 8 7.7e-05 :ffffffff80479fb7: mov $0x1,%ebp
> 43000 0.4127 :ffffffff80479fbc: sub 0x38(%rbx),%edx
> 11149 0.1070 :ffffffff80479fbf: mov %eax,%ecx
> :ffffffff80479fc1: shl $0x8,%eax
> 66497 0.6382 :ffffffff80479fc4: shr $0x8,%ecx
> 735 0.0071 :ffffffff80479fc7: or %ecx,%eax
> :ffffffff80479fc9: movzwl %ax,%eax
> 5459 0.0524 :ffffffff80479fcc: cmp %edx,%eax
> 522 0.0050 :ffffffff80479fce: jne ffffffff80479fdc
> <inet_gro_receive+0xbc>
> :ffffffff80479fd0: xor %ebp,%ebp
> 5373 0.0516 :ffffffff80479fd2: cmpw $0x40,0x6(%r8)
> 345 0.0033 :ffffffff80479fd8: setne %bpl
> :ffffffff80479fdc: movzwl 0x4(%r8),%eax
> 2384 0.0229 :ffffffff80479fe1: mov 0x0(%r13),%r10
> 631 0.0061 :ffffffff80479fe5: mov %eax,%edx
> :ffffffff80479fe7: shl $0x8,%eax
> 3044 0.0292 :ffffffff80479fea: shr $0x8,%edx
> 303 0.0029 :ffffffff80479fed: or %edx,%eax
> :ffffffff80479fef: movzwl %ax,%r12d
> 2747 0.0264 :ffffffff80479ff3: jmp ffffffff8047a071
> <inet_gro_receive+0x151>
> 2109 0.0202 :ffffffff80479ff5: lea 0x38(%r10),%r9
> 12 1.2e-04 :ffffffff80479ff9: cmpl $0x0,0x4(%r9)
> 23 2.2e-04 :ffffffff80479ffe: je ffffffff8047a06e
> <inet_gro_receive+0x14e>
> 2104 0.0202 :ffffffff8047a000: mov 0xac(%r10),%edi
> 2 1.9e-05 :ffffffff8047a007: add 0xc0(%r10),%rdi
> :ffffffff8047a00e: mov 0x9(%rdi),%sil
> 2391 0.0229 :ffffffff8047a012: mov 0x1(%rdi),%al
> 2 1.9e-05 :ffffffff8047a015: xor 0x9(%r8),%sil
> 7 6.7e-05 :ffffffff8047a019: xor 0x1(%r8),%al
> 2101 0.0202 :ffffffff8047a01d: mov 0xc(%rdi),%edx
> 1 9.6e-06 :ffffffff8047a020: mov 0x10(%rdi),%ecx
> :ffffffff8047a023: xor 0xc(%r8),%edx
> 2775 0.0266 :ffffffff8047a027: xor 0x10(%r8),%ecx
> :ffffffff8047a02b: or %esi,%eax
> :ffffffff8047a02d: movzbl %al,%eax
> 62734 0.6021 :ffffffff8047a030: or %edx,%ecx
> :ffffffff8047a032: or %eax,%ecx
> :ffffffff8047a034: je ffffffff8047a040
> <inet_gro_receive+0x120>
> :ffffffff8047a036: movl $0x0,0x4(%r9)
> :ffffffff8047a03e: jmp ffffffff8047a06e
> <inet_gro_receive+0x14e>
> 2106 0.0202 :ffffffff8047a040: movzwl 0x4(%rdi),%edx
> :ffffffff8047a044: mov 0x8(%rdi),%al
> :ffffffff8047a047: xor 0x8(%r8),%eax
> 64244 0.6166 :ffffffff8047a04b: mov %edx,%ecx
> :ffffffff8047a04d: shl $0x8,%edx
> :ffffffff8047a050: shr $0x8,%ecx
> 2072 0.0199 :ffffffff8047a053: movzbl %al,%eax
> :ffffffff8047a056: or 0x8(%r9),%eax
> :ffffffff8047a05a: or %ecx,%edx
> 2629 0.0252 :ffffffff8047a05c: add 0xc(%r9),%edx
> 2 1.9e-05 :ffffffff8047a060: movzwl %dx,%edx
> :ffffffff8047a063: xor %r12d,%edx
> 58223 0.5588 :ffffffff8047a066: or %edx,%eax
> 3 2.9e-05 :ffffffff8047a068: or %ebp,%eax
> :ffffffff8047a06a: mov %eax,0x8(%r9)
> 21878 0.2100 :ffffffff8047a06e: mov (%r10),%r10
> 2156 0.0207 :ffffffff8047a071: test %r10,%r10
> :ffffffff8047a074: jne ffffffff80479ff5
> <inet_gro_receive+0xd5>
> 3007 0.0289 :ffffffff8047a07a: mov 0x38(%rbx),%eax
> 61 5.9e-04 :ffffffff8047a07d: or %ebp,0x40(%rbx)
> 3 2.9e-05 :ffffffff8047a080: mov %rbx,%rsi
> 3091 0.0297 :ffffffff8047a083: mov %r13,%rdi
> 41 3.9e-04 :ffffffff8047a086: add $0x14,%eax
> :ffffffff8047a089: mov %eax,0x38(%rbx)
> 3704 0.0355 :ffffffff8047a08c: sub 0xc0(%rbx),%eax
> 33 3.2e-04 :ffffffff8047a092: add 0xc8(%rbx),%eax
> :ffffffff8047a098: mov %eax,0xa8(%rbx)
> 2468 0.0237 :ffffffff8047a09e: callq *0x20(%r11)
> 20011 0.1921 :ffffffff8047a0a2: jmp ffffffff8047a0ab
> <inet_gro_receive+0x18b>
> :ffffffff8047a0a4: xor %eax,%eax
> :ffffffff8047a0a6: mov $0x1,%ebp
> 24082 0.2311 :ffffffff8047a0ab: or %ebp,0x40(%rbx)
> 626 0.0060 :ffffffff8047a0ae: pop %r10
> 1718 0.0165 :ffffffff8047a0b0: pop %rbx
> 446 0.0043 :ffffffff8047a0b1: pop %rbp
> 4074 0.0391 :ffffffff8047a0b2: pop %r12
> 2089 0.0200 :ffffffff8047a0b4: pop %r13
> 434 0.0042 :ffffffff8047a0b6: retq
>
>
>
>
> ffffffff80430ea9 <skb_gro_receive>: /* skb_gro_receive total: 477479
> 4.5827 */
> 2158 0.0207 :ffffffff80430ea9: push %r15
> 2492 0.0239 :ffffffff80430eab: mov %rdi,%r15
> :ffffffff80430eae: push %r14
> :ffffffff80430eb0: push %r13
> 2432 0.0233 :ffffffff80430eb2: push %r12
> 1 9.6e-06 :ffffffff80430eb4: push %rbp
> 1 9.6e-06 :ffffffff80430eb5: mov %rsi,%rbp
> 2430 0.0233 :ffffffff80430eb8: push %rbx
> :ffffffff80430eb9: sub $0x8,%rsp
> :ffffffff80430ebd: mov 0x68(%rsi),%ecx
> 2420 0.0232 :ffffffff80430ec0: mov (%rdi),%r12
> 1 9.6e-06 :ffffffff80430ec3: mov %ecx,%r14d
> 1 9.6e-06 :ffffffff80430ec6: sub 0x38(%rsi),%r14d
> 2317 0.0222 :ffffffff80430eca: mov %r14d,%eax
> 1 9.6e-06 :ffffffff80430ecd: add 0x68(%r12),%eax
> 1 9.6e-06 :ffffffff80430ed2: cmp $0xffff,%eax
> 3865 0.0371 :ffffffff80430ed7: ja ffffffff80431261
> <skb_gro_receive+0x3b8>
> :ffffffff80430edd: mov 0xb8(%r12),%eax
> :ffffffff80430ee5: mov 0xc0(%r12),%rdx
> 8082 0.0776 :ffffffff80430eed: lea (%rdx,%rax,1),%rsi
> :ffffffff80430ef1: cmpq $0x0,0x18(%rsi)
> 2 1.9e-05 :ffffffff80430ef6: jne ffffffff804311ab
> <skb_gro_receive+0x302>
> 9249 0.0888 :ffffffff80430efc: mov %ecx,%edi
> :ffffffff80430efe: sub 0x6c(%rbp),%edi
> 6 5.8e-05 :ffffffff80430f01: cmp 0x38(%rbp),%edi
> 3104 0.0298 :ffffffff80430f04: ja ffffffff80430fe2
> <skb_gro_receive+0x139>
> 2 1.9e-05 :ffffffff80430f0a: mov 0xb8(%rbp),%ecx
> :ffffffff80430f10: movzwl 0x4(%rsi),%edx
> 8825 0.0847 :ffffffff80430f14: add 0xc0(%rbp),%rcx
> :ffffffff80430f1b: movzwl 0x4(%rcx),%eax
> 21 2.0e-04 :ffffffff80430f1f: add %edx,%eax
> 19668 0.1888 :ffffffff80430f21: cmp $0x12,%eax
> 1 9.6e-06 :ffffffff80430f24: ja ffffffff80431261
> <skb_gro_receive+0x3b8>
> :ffffffff80430f2a: mov 0x38(%rcx),%eax
> 1974 0.0189 :ffffffff80430f2d: add 0x38(%rbp),%eax
> :ffffffff80430f30: cld
> :ffffffff80430f31: sub %edi,%eax
> 7666 0.0736 :ffffffff80430f33: mov %eax,0x38(%rcx)
> 2 1.9e-05 :ffffffff80430f36: mov 0xb8(%rbp),%edx
> :ffffffff80430f3c: add 0xc0(%rbp),%rdx
The compiler apparently has a hard time optimizing this function... :(
skb_shinfo(skb) & skb_shinfo(p) are evaluated many times.
> 52468 0.5036 :ffffffff80430f43: mov 0x3c(%rdx),%eax
> 2 1.9e-05 :ffffffff80430f46: add 0x68(%rbp),%eax
> 1 9.6e-06 :ffffffff80430f49: sub 0x6c(%rbp),%eax
> 6592 0.0633 :ffffffff80430f4c: sub 0x38(%rbp),%eax
> :ffffffff80430f4f: mov %eax,0x3c(%rdx)
> :ffffffff80430f52: mov 0xb8(%r12),%eax
> 23018 0.2209 :ffffffff80430f5a: add 0xc0(%r12),%rax
> 1 9.6e-06 :ffffffff80430f62: mov 0xb8(%rbp),%esi
> :ffffffff80430f68: add 0xc0(%rbp),%rsi
> 8477 0.0814 :ffffffff80430f6f: movzwl 0x4(%rax),%edi
> 6 5.8e-05 :ffffffff80430f73: movzwl 0x4(%rsi),%ecx
> :ffffffff80430f77: add $0x30,%rsi
> 21338 0.2048 :ffffffff80430f7b: shl $0x4,%rdi
> 3 2.9e-05 :ffffffff80430f7f: lea 0x30(%rdi,%rax,1),%rdi
> 1 9.6e-06 :ffffffff80430f84: shl $0x4,%rcx
> 150632 1.4457 :ffffffff80430f88: rep movsb %ds:(%rsi),%es:(%rdi)
ouch... What a stupid compiler... it should use movsq here :(
We could try to inline the likely case of a single fragment being copied...
> 3988 0.0383 :ffffffff80430f8a: mov 0xb8(%r12),%eax
> 2015 0.0193 :ffffffff80430f92: mov 0xb8(%rbp),%ecx
> 11 1.1e-04 :ffffffff80430f98: add 0xc0(%r12),%rax
> 8 7.7e-05 :ffffffff80430fa0: mov 0xc0(%rbp),%rdx
> 3295 0.0316 :ffffffff80430fa7: mov 0x4(%rdx,%rcx,1),%edx
> :ffffffff80430fab: add %dx,0x4(%rax)
> 8 7.7e-05 :ffffffff80430faf: mov 0xb8(%rbp),%edx
> 2507 0.0241 :ffffffff80430fb5: mov 0xc0(%rbp),%rax
> :ffffffff80430fbc: movw $0x0,0x4(%rax,%rdx,1)
> 3233 0.0310 :ffffffff80430fc3: mov 0x6c(%rbp),%eax
> 1 9.6e-06 :ffffffff80430fc6: sub %eax,0xd0(%rbp)
> :ffffffff80430fcc: sub %eax,0x68(%rbp)
> 41540 0.3987 :ffffffff80430fcf: movl $0x0,0x6c(%rbp)
> :ffffffff80430fd6: movl $0x1,0x48(%rbp)
> :ffffffff80430fdd: jmpq ffffffff8043123f
> <skb_gro_receive+0x396>
> :ffffffff80430fe2: mov 0xc8(%r12),%rax
> :ffffffff80430fea: mov 0x20(%r12),%rdi
> :ffffffff80430fef: mov %eax,%r13d
> :ffffffff80430ff2: sub %edx,%r13d
> :ffffffff80430ff5: mov $0x20,%edx
> :ffffffff80430ffa: mov %r13d,%esi
> :ffffffff80430ffd: add 0x38(%r12),%esi
> :ffffffff80431002: callq ffffffff8042ffe0
...
> :ffffffff80431223: ud2a
> :ffffffff80431225: jmp ffffffff80431225
> <skb_gro_receive+0x37c>
> :ffffffff80431227: mov 0xb8(%rbp),%eax
> :ffffffff8043122d: orb $0x10,0x7c(%rbp)
> :ffffffff80431231: add 0xc0(%rbp),%rax
> :ffffffff80431238: lock addl $0x10000,(%rax)
> 34919 0.3351 :ffffffff8043123f: add %r14d,0x6c(%r12)
> 1989 0.0191 :ffffffff80431244: add %r14d,0xd0(%r12)
> 1 9.6e-06 :ffffffff8043124c: xor %eax,%eax
> :ffffffff8043124e: add %r14d,0x68(%r12)
> 20605 0.1978 :ffffffff80431253: incl 0x44(%r12)
> :ffffffff80431258: movl $0x1,0x3c(%rbp)
> :ffffffff8043125f: jmp ffffffff80431266
> <skb_gro_receive+0x3bd>
> :ffffffff80431261: mov $0xfffffff9,%eax
> 13260 0.1273 :ffffffff80431266: pop %r11
> 1946 0.0187 :ffffffff80431268: pop %rbx
> 2010 0.0193 :ffffffff80431269: pop %rbp
> 64 6.1e-04 :ffffffff8043126a: pop %r12
> 1948 0.0187 :ffffffff8043126c: pop %r13
> 2746 0.0264 :ffffffff8043126e: pop %r14
> 57 5.5e-04 :ffffffff80431270: pop %r15
> 2067 0.0198 :ffffffff80431272: retq
>
> ffffffff80460663 <tcp_gro_receive>: /* tcp_gro_receive total: 396796
> 3.8083 */
> 4433 0.0425 :ffffffff80460663: push %r15
> 2204 0.0212 :ffffffff80460665: push %r14
> :ffffffff80460667: mov %rdi,%r14
> :ffffffff8046066a: push %r13
> 2275 0.0218 :ffffffff8046066c: push %r12
> :ffffffff8046066e: mov %rsi,%r12
> :ffffffff80460671: mov $0x14,%esi
> 5933 0.0569 :ffffffff80460676: mov %r12,%rdi
> :ffffffff80460679: push %rbp
> :ffffffff8046067a: push %rbx
> 2180 0.0209 :ffffffff8046067b: sub $0x8,%rsp
> :ffffffff8046067f: callq ffffffff804357a1
> <skb_gro_header>
> :ffffffff80460684: test %rax,%rax
> 3218 0.0309 :ffffffff80460687: je ffffffff804607ed
> <tcp_gro_receive+0x18a>
> :ffffffff8046068d: mov 0xc(%rax),%al
> 1 9.6e-06 :ffffffff80460690: shr $0x4,%al
> 3528 0.0339 :ffffffff80460693: movzbl %al,%eax
> :ffffffff80460696: lea 0x0(,%rax,4),%r13d
> 1 9.6e-06 :ffffffff8046069e: cmp $0x13,%r13d
> 2773 0.0266 :ffffffff804606a2: jbe ffffffff804607ed
> <tcp_gro_receive+0x18a>
> :ffffffff804606a8: mov %r13d,%esi
> :ffffffff804606ab: mov %r12,%rdi
> 3327 0.0319 :ffffffff804606ae: callq ffffffff804357a1
> <skb_gro_header>
> :ffffffff804606b3: test %rax,%rax
> 2094 0.0201 :ffffffff804606b6: mov %rax,%r8
> :ffffffff804606b9: je ffffffff804607ed
> <tcp_gro_receive+0x18a>
> :ffffffff804606bf: lea 0x38(%r12),%r15
> 2245 0.0215 :ffffffff804606c4: add %r13d,(%r15)
> :ffffffff804606c7: mov 0x68(%r12),%ebp
> :ffffffff804606cc: sub 0x38(%r12),%ebp
> 2394 0.0230 :ffffffff804606d1: mov 0xc(%rax),%ebx
> :ffffffff804606d4: jmp ffffffff80460710
> <tcp_gro_receive+0xad>
> 2111 0.0203 :ffffffff804606d6: lea 0x38(%rdi),%r9
> 3 2.9e-05 :ffffffff804606da: cmpl $0x0,0x4(%r9)
> 21 2.0e-04 :ffffffff804606df: je ffffffff8046070d
> <tcp_gro_receive+0xaa>
> 2592 0.0249 :ffffffff804606e1: mov 0xa8(%rdi),%eax
> :ffffffff804606e7: mov 0xc0(%rdi),%r10
> :ffffffff804606ee: mov 0x2(%r8),%dx
> 2440 0.0234 :ffffffff804606f3: lea (%r10,%rax,1),%rcx
> :ffffffff804606f7: mov (%r8),%eax
> 1 9.6e-06 :ffffffff804606fa: xor 0x2(%rcx),%dx
> 6275 0.0602 :ffffffff804606fe: xor (%rcx),%eax
> 3 2.9e-05 :ffffffff80460700: or %ax,%dx
> :ffffffff80460703: je ffffffff8046071d
> <tcp_gro_receive+0xba>
> :ffffffff80460705: movl $0x0,0x4(%r9)
> :ffffffff8046070d: mov %rdi,%r14
> 2920 0.0280 :ffffffff80460710: mov (%r14),%rdi
> 18 1.7e-04 :ffffffff80460713: test %rdi,%rdi
> 2 1.9e-05 :ffffffff80460716: jne ffffffff804606d6
> <tcp_gro_receive+0x73>
> 33 3.2e-04 :ffffffff80460718: jmpq ffffffff80460807
> <tcp_gro_receive+0x1a4>
> 4253 0.0408 :ffffffff8046071d: mov 0xe(%r8),%ax
> 2125 0.0204 :ffffffff80460722: xor 0xe(%rcx),%ax
> 2 1.9e-05 :ffffffff80460726: mov %ebx,%edx
> :ffffffff80460728: and $0x8000,%edx
> 8066 0.0774 :ffffffff8046072e: or 0x8(%r9),%edx
> :ffffffff80460732: movzwl %ax,%esi
> :ffffffff80460735: mov 0x8(%r8),%eax
> 64740 0.6214 :ffffffff80460739: xor 0x8(%rcx),%eax
> :ffffffff8046073c: or %eax,%esi
> :ffffffff8046073e: mov %ebx,%eax
> 2084 0.0200 :ffffffff80460740: xor 0xc(%rcx),%eax
> :ffffffff80460743: and $0x76,%ah
> :ffffffff80460746: or %eax,%edx
> 2132 0.0205 :ffffffff80460748: or %edx,%esi
> :ffffffff8046074a: mov $0x14,%edx
> :ffffffff8046074f: jmp ffffffff8046075e
> <tcp_gro_receive+0xfb>
> :ffffffff80460751: movslq %edx,%rax
> :ffffffff80460754: add $0x4,%edx
> :ffffffff80460757: mov (%r8,%rax,1),%esi
> :ffffffff8046075b: xor (%rcx,%rax,1),%esi
> 3670 0.0352 :ffffffff8046075e: test %esi,%esi
> 2162 0.0208 :ffffffff80460760: jne ffffffff80460767
> <tcp_gro_receive+0x104>
> :ffffffff80460762: cmp %r13d,%edx
> 1 9.6e-06 :ffffffff80460765: jb ffffffff80460751
> <tcp_gro_receive+0xee>
> 50209 0.4819 :ffffffff80460767: mov 0xb8(%rdi),%eax
> 4473 0.0429 :ffffffff8046076d: mov 0x4(%rcx),%edx
> :ffffffff80460770: bswap %edx
> 9554 0.0917 :ffffffff80460772: mov 0x4(%r8),%ecx
> :ffffffff80460776: bswap %ecx
> :ffffffff80460778: movzwl 0x6(%r10,%rax,1),%r13d
> 7572 0.0727 :ffffffff8046077e: mov 0x68(%rdi),%eax
> :ffffffff80460781: sub 0x38(%rdi),%eax
> :ffffffff80460784: add %edx,%eax
> 9803 0.0941 :ffffffff80460786: xor %eax,%ecx
> :ffffffff80460788: cmp %r13d,%ebp
> :ffffffff8046078b: seta %al
> 50608 0.4857 :ffffffff8046078e: test %ebp,%ebp
> :ffffffff80460790: sete %dl
> :ffffffff80460793: or %edx,%eax
> 3161 0.0303 :ffffffff80460795: movzbl %al,%eax
> :ffffffff80460798: or %eax,%esi
> :ffffffff8046079a: or %esi,%ecx
> 3278 0.0315 :ffffffff8046079c: jne ffffffff804607f6
> <tcp_gro_receive+0x193>
> :ffffffff8046079e: mov %r12,%rsi
> 2 1.9e-05 :ffffffff804607a1: mov %r14,%rdi
> 2579 0.0248 :ffffffff804607a4: callq ffffffff80430ea9
> <skb_gro_receive>
> 2059 0.0198 :ffffffff804607a9: test %eax,%eax
> 49 4.7e-04 :ffffffff804607ab: jne ffffffff804607f6
> <tcp_gro_receive+0x193>
> :ffffffff804607ad: mov (%r14),%rcx
> 1945 0.0187 :ffffffff804607b0: mov %ebx,%edx
> 3 2.9e-05 :ffffffff804607b2: and $0x900,%edx
> :ffffffff804607b8: mov 0xa8(%rcx),%eax
> 2530 0.0243 :ffffffff804607be: add 0xc0(%rcx),%rax
> 3 2.9e-05 :ffffffff804607c5: or %edx,0xc(%rax)
> 13 1.2e-04 :ffffffff804607c8: xor %eax,%eax
> 4881 0.0468 :ffffffff804607ca: cmp %r13d,%ebp
> :ffffffff804607cd: setb %al
> :ffffffff804607d0: and $0x2f00,%ebx
> 1912 0.0184 :ffffffff804607d6: or %ebx,%eax
> :ffffffff804607d8: test %rcx,%rcx
> :ffffffff804607db: je ffffffff80460816
> <tcp_gro_receive+0x1b3>
> 2163 0.0208 :ffffffff804607dd: cmpl $0x0,0x4(%r15)
> 136 0.0013 :ffffffff804607e2: je ffffffff804607e8
> <tcp_gro_receive+0x185>
> 2455 0.0236 :ffffffff804607e4: test %eax,%eax
> 57 5.5e-04 :ffffffff804607e6: je ffffffff80460816
> <tcp_gro_receive+0x1b3>
> 148 0.0014 :ffffffff804607e8: mov %r14,%rdi
> 735 0.0071 :ffffffff804607eb: jmp ffffffff80460818
> <tcp_gro_receive+0x1b5>
> :ffffffff804607ed: xor %edi,%edi
> :ffffffff804607ef: mov $0x1,%eax
> :ffffffff804607f4: jmp ffffffff80460818
> <tcp_gro_receive+0x1b5>
> 68 6.5e-04 :ffffffff804607f6: xor %eax,%eax
> 1 9.6e-06 :ffffffff804607f8: test %ebp,%ebp
> 67 6.4e-04 :ffffffff804607fa: sete %al
> 47 4.5e-04 :ffffffff804607fd: and $0x2f00,%ebx
> :ffffffff80460803: or %ebx,%eax
> 58 5.6e-04 :ffffffff80460805: jmp ffffffff804607dd
> <tcp_gro_receive+0x17a>
> 122 0.0012 :ffffffff80460807: xor %eax,%eax
> 9 8.6e-05 :ffffffff80460809: test %ebp,%ebp
> :ffffffff8046080b: sete %al
> 67 6.4e-04 :ffffffff8046080e: and $0x2f00,%ebx
> 6 5.8e-05 :ffffffff80460814: or %ebx,%eax
> 1995 0.0191 :ffffffff80460816: xor %edi,%edi
> 68 6.5e-04 :ffffffff80460818: or %eax,0x40(%r12)
> 275 0.0026 :ffffffff8046081d: mov %rdi,%rax
> 2037 0.0196 :ffffffff80460820: pop %r11
> 191 0.0018 :ffffffff80460822: pop %rbx
> 4346 0.0417 :ffffffff80460823: pop %rbp
> 4739 0.0455 :ffffffff80460824: pop %r12
> 167 0.0016 :ffffffff80460826: pop %r13
> 23735 0.2278 :ffffffff80460828: pop %r14
> 56070 0.5381 :ffffffff8046082a: pop %r15
> 140 0.0013 :ffffffff8046082c: retq
>
> ffffffff804357a1 <skb_gro_header>: /* skb_gro_header total: 319455
> 3.0660 */
> 13604 0.1306 :ffffffff804357a1: push %rbp
> 14938 0.1434 :ffffffff804357a2: push %rbx
> :ffffffff804357a3: mov %rdi,%rbx
> :ffffffff804357a6: sub $0x8,%rsp
> 18392 0.1765 :ffffffff804357aa: mov 0x38(%rdi),%ebp
> :ffffffff804357ad: mov 0x68(%rdi),%edx
> 1 9.6e-06 :ffffffff804357b0: add %ebp,%esi
> 20559 0.1973 :ffffffff804357b2: mov %edx,%edi
> :ffffffff804357b4: sub 0x6c(%rbx),%edi
> :ffffffff804357b7: jne ffffffff804357cc
> <skb_gro_header+0x2b>
> 36626 0.3515 :ffffffff804357b9: mov 0xb8(%rbx),%ecx
> 2 1.9e-05 :ffffffff804357bf: mov 0xc0(%rbx),%rax
> 3 2.9e-05 :ffffffff804357c6: cmp %esi,0x3c(%rax,%rcx,1)
> 18577 0.1783 :ffffffff804357ca: jae ffffffff804357ee
> <skb_gro_header+0x4d>
> :ffffffff804357cc: cmp %edi,%esi
> :ffffffff804357ce: jbe ffffffff804357e3
> <skb_gro_header+0x42>
> :ffffffff804357d0: cmp %edx,%esi
> :ffffffff804357d2: ja ffffffff80435833
> <skb_gro_header+0x92>
> :ffffffff804357d4: sub %edi,%esi
> :ffffffff804357d6: mov %rbx,%rdi
> :ffffffff804357d9: callq ffffffff8042f6ee
> <__pskb_pull_tail>
> :ffffffff804357de: test %rax,%rax
> :ffffffff804357e1: je ffffffff80435833
> <skb_gro_header+0x92>
> :ffffffff804357e3: mov %ebp,%eax
> :ffffffff804357e5: add 0xc8(%rbx),%rax
> :ffffffff804357ec: jmp ffffffff80435835
> <skb_gro_header+0x94>
> 3 2.9e-05 :ffffffff804357ee: add 0xc0(%rbx),%rcx
> 25999 0.2495 :ffffffff804357f5: mov $0x1e0000000000,%rax
> :ffffffff804357ff: mov $0x6db6db6db6db6db7,%rdx
OK, sizeof(struct page) is 0x38; we know it hurts some workloads.
It would be better to waste a few bytes and align them on cache lines here.
> 44557 0.4276 :ffffffff80435809: add 0x30(%rcx),%rax
> :ffffffff8043580d: sar $0x3,%rax
> 12588 0.1208 :ffffffff80435811: imul %rdx,%rax
> 10104 0.0970 :ffffffff80435815: mov $0xffff880000000000,%rdx
> :ffffffff8043581f: shl $0xc,%rax
> :ffffffff80435823: add %rdx,%rax
> 16404 0.1574 :ffffffff80435826: mov 0x38(%rcx),%edx
> :ffffffff80435829: add %rdx,%rax
> :ffffffff8043582c: mov %ebp,%edx
> 15264 0.1465 :ffffffff8043582e: add %rdx,%rax
> :ffffffff80435831: jmp ffffffff80435835
> <skb_gro_header+0x94>
> :ffffffff80435833: xor %eax,%eax
> 45844 0.4400 :ffffffff80435835: pop %r10
> 2 1.9e-05 :ffffffff80435837: pop %rbx
> 12844 0.1233 :ffffffff80435838: pop %rbp
> 13144 0.1262 :ffffffff80435839: retq
>
I wonder if you could try to enlarge 'struct page' by 8 bytes and redo a test...
Here is a patch to combine the two ideas. But it won't allow GRO to go much faster, I guess :(
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0e80e26..44e97e2 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -98,6 +98,7 @@ struct page {
#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
unsigned long debug_flags; /* Use atomic bitops on this */
#endif
+ unsigned long _pad; /* so that sizeof(struct page) is 64 bytes */
};
/*
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ce6356c..74a6900 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2660,28 +2660,37 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
struct sk_buff *nskb;
unsigned int headroom;
unsigned int len = skb_gro_len(skb);
+ int delta;
+ struct skb_shared_info *skb_shinfo_p = skb_shinfo(p);
if (p->len + len >= 65536)
return -E2BIG;
- if (skb_shinfo(p)->frag_list)
+ delta = skb_gro_offset(skb) - skb_headlen(skb);
+ if (skb_shinfo_p->frag_list)
goto merge;
- else if (skb_headlen(skb) <= skb_gro_offset(skb)) {
- if (skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags >
+ if (delta >= 0) {
+ struct skb_shared_info *skb_shinfo_skb = skb_shinfo(skb);
+
+ if (skb_shinfo_p->nr_frags + skb_shinfo_skb->nr_frags >
MAX_SKB_FRAGS)
return -E2BIG;
- skb_shinfo(skb)->frags[0].page_offset +=
- skb_gro_offset(skb) - skb_headlen(skb);
- skb_shinfo(skb)->frags[0].size -=
- skb_gro_offset(skb) - skb_headlen(skb);
-
- memcpy(skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags,
- skb_shinfo(skb)->frags,
- skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
+ skb_shinfo_skb->frags[0].page_offset += delta;
+ skb_shinfo_skb->frags[0].size -= delta;
- skb_shinfo(p)->nr_frags += skb_shinfo(skb)->nr_frags;
- skb_shinfo(skb)->nr_frags = 0;
+ if (likely(skb_shinfo_skb->nr_frags == 1)) {
+ memcpy(skb_shinfo_p->frags + skb_shinfo_p->nr_frags,
+ skb_shinfo_skb->frags,
+ sizeof(skb_frag_t));
+ skb_shinfo_p->nr_frags += 1;
+ } else {
+ memcpy(skb_shinfo_p->frags + skb_shinfo_p->nr_frags,
+ skb_shinfo_skb->frags,
+ skb_shinfo_skb->nr_frags * sizeof(skb_frag_t));
+ skb_shinfo_p->nr_frags += skb_shinfo_skb->nr_frags;
+ }
+ skb_shinfo_skb->nr_frags = 0;
skb->truesize -= skb->data_len;
skb->len -= skb->data_len;
@@ -2726,12 +2735,11 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
p = nskb;
+ delta = skb_gro_offset(skb) - skb_headlen(skb);
merge:
- if (skb_gro_offset(skb) > skb_headlen(skb)) {
- skb_shinfo(skb)->frags[0].page_offset +=
- skb_gro_offset(skb) - skb_headlen(skb);
- skb_shinfo(skb)->frags[0].size -=
- skb_gro_offset(skb) - skb_headlen(skb);
+ if (delta > 0) {
+ skb_shinfo(skb)->frags[0].page_offset += delta;
+ skb_shinfo(skb)->frags[0].size -= delta;
skb_gro_reset_offset(skb);
skb_gro_pull(skb, skb_headlen(skb));
}
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists