Message-ID: <ZrKL2youCTmO3K0Q@tassilo>
Date: Tue, 6 Aug 2024 13:47:23 -0700
From: Andi Kleen <ak@...ux.intel.com>
To: Mateusz Guzik <mjguzik@...il.com>
Cc: Jeff Layton <jlayton@...nel.org>,
Alexander Viro <viro@...iv.linux.org.uk>,
Christian Brauner <brauner@...nel.org>, Jan Kara <jack@...e.cz>,
Andrew Morton <akpm@...ux-foundation.org>,
Josef Bacik <josef@...icpanda.com>, linux-fsdevel@...r.kernel.org,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2] fs: try an opportunistic lookup for O_CREAT opens too
> Before I get to the vfs layer, there is a significant loss in the
> memory allocator because of memcg -- it takes several irq off/on trips
> for every alloc (needed to grab struct file *). I have a plan for what
> to do with it (handle it with a local cmpxchg (note: no lock prefix)),
> which I'm trying to get around to. Apart from that, you may note the
> allocator fast path performs a 16-byte cmpxchg, which is again dog
> slow and executes twice (once for the file obj, another time for the
> namei buffer). Someone(tm) should patch it up and I have some vague
> ideas, but 0 idea when I can take a serious stab.
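
(To make the quoted idea concrete: a minimal kernel-style sketch of such a
per-CPU charge cache consumed with a plain, non-LOCK cmpxchg could look
roughly like the below. The names and the refill policy are made up for
illustration; this is not the actual memcg code.)

#include <linux/percpu.h>
#include <linux/types.h>

/* Hypothetical per-CPU cache of pre-charged pages; illustration only. */
static DEFINE_PER_CPU(unsigned int, charge_cache);

static bool try_charge_fast(unsigned int nr_pages)
{
	unsigned int old, new;

	do {
		old = this_cpu_read(charge_cache);
		if (old < nr_pages)
			return false;	/* refill via the existing slow path */
		new = old - nr_pages;
		/*
		 * this_cpu_cmpxchg() compiles to CMPXCHG without a LOCK
		 * prefix on x86: the variable is only touched by the local
		 * CPU, so it only needs to be atomic against interrupts,
		 * not against other CPUs.
		 */
	} while (this_cpu_cmpxchg(charge_cache, old, new) != old);

	return true;
}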

I just LBR sampled it on my Skylake and it doesn't look particularly
slow. You can see the whole massive block including the CMPXCHG16B gets
an IPC of 2.7, which is rather good. If you see lots of cycles on it,
it's likely a missing cache line.

kmem_cache_free:
ffffffff9944ce20 nop %edi, %edx
ffffffff9944ce24 nopl %eax, (%rax,%rax,1)
ffffffff9944ce29 pushq %rbp
ffffffff9944ce2a mov %rdi, %rdx
ffffffff9944ce2d mov %rsp, %rbp
ffffffff9944ce30 pushq %r15
ffffffff9944ce32 pushq %r14
ffffffff9944ce34 pushq %r13
ffffffff9944ce36 pushq %r12
ffffffff9944ce38 mov $0x80000000, %r12d
ffffffff9944ce3e pushq %rbx
ffffffff9944ce3f mov %rsi, %rbx
ffffffff9944ce42 and $0xfffffffffffffff0, %rsp
ffffffff9944ce46 sub $0x10, %rsp
ffffffff9944ce4a movq %gs:0x28, %rax
ffffffff9944ce53 movq %rax, 0x8(%rsp)
ffffffff9944ce58 xor %eax, %eax
ffffffff9944ce5a add %rsi, %r12
ffffffff9944ce5d jb 0xffffffff9944d1ea
ffffffff9944ce63 mov $0xffffffff80000000, %rax
ffffffff9944ce6a xor %r13d, %r13d
ffffffff9944ce6d subq 0x17b068c(%rip), %rax
ffffffff9944ce74 add %r12, %rax
ffffffff9944ce77 shr $0xc, %rax
ffffffff9944ce7b shl $0x6, %rax
ffffffff9944ce7f addq 0x17b066a(%rip), %rax
ffffffff9944ce86 movq 0x8(%rax), %rcx
ffffffff9944ce8a test $0x1, %cl
ffffffff9944ce8d jnz 0xffffffff9944d15c
ffffffff9944ce93 nopl %eax, (%rax,%rax,1)
ffffffff9944ce98 movq (%rax), %rcx
ffffffff9944ce9b and $0x8, %ch
ffffffff9944ce9e jz 0xffffffff9944cfea
ffffffff9944cea4 test %rax, %rax
ffffffff9944cea7 jz 0xffffffff9944cfea
ffffffff9944cead movq 0x8(%rax), %r14
ffffffff9944ceb1 test %r14, %r14
ffffffff9944ceb4 jz 0xffffffff9944cfac
ffffffff9944ceba cmp %r14, %rdx
ffffffff9944cebd jnz 0xffffffff9944d165
ffffffff9944cec3 test %r14, %r14
ffffffff9944cec6 jz 0xffffffff9944cfac
ffffffff9944cecc movq 0x8(%rbp), %r15
ffffffff9944ced0 nopl %eax, (%rax,%rax,1)
ffffffff9944ced5 movq 0x1fe5134(%rip), %rax
ffffffff9944cedc test %r13, %r13
ffffffff9944cedf jnz 0xffffffff9944ceef
ffffffff9944cee1 mov $0xffffffff80000000, %rax
ffffffff9944cee8 subq 0x17b0611(%rip), %rax
ffffffff9944ceef add %rax, %r12
ffffffff9944cef2 shr $0xc, %r12
ffffffff9944cef6 shl $0x6, %r12
ffffffff9944cefa addq 0x17b05ef(%rip), %r12
ffffffff9944cf01 movq 0x8(%r12), %rax
ffffffff9944cf06 mov %r12, %r13
ffffffff9944cf09 test $0x1, %al
ffffffff9944cf0b jnz 0xffffffff9944d1b1
ffffffff9944cf11 nopl %eax, (%rax,%rax,1)
ffffffff9944cf16 movq (%r13), %rax
ffffffff9944cf1a movq %rbx, (%rsp)
ffffffff9944cf1e test $0x8, %ah
ffffffff9944cf21 mov $0x0, %eax
ffffffff9944cf26 cmovz %rax, %r13
ffffffff9944cf2a data16 nop
ffffffff9944cf2c movq 0x38(%r13), %r8
ffffffff9944cf30 cmp $0x3, %r8
ffffffff9944cf34 jnbe 0xffffffff9944d1ca
ffffffff9944cf3a nopl %eax, (%rax,%rax,1)
ffffffff9944cf3f movq 0x23d6f72(%rip), %rax
ffffffff9944cf46 mov %rbx, %rdx
ffffffff9944cf49 sub %rax, %rdx
ffffffff9944cf4c cmp $0x1fffff, %rdx
ffffffff9944cf53 jbe 0xffffffff9944d03a
ffffffff9944cf59 movq (%r14), %rax
ffffffff9944cf5c addq %gs:0x66bccab4(%rip), %rax
ffffffff9944cf64 movq 0x8(%rax), %rdx
ffffffff9944cf68 cmpq %r13, 0x10(%rax)
ffffffff9944cf6c jnz 0xffffffff9944d192
ffffffff9944cf72 movl 0x28(%r14), %ecx
ffffffff9944cf76 movq (%rax), %rax
ffffffff9944cf79 add %rbx, %rcx
ffffffff9944cf7c cmp %rbx, %rax
ffffffff9944cf7f jz 0xffffffff9944d1ba
ffffffff9944cf85 movq 0xb8(%r14), %rsi
ffffffff9944cf8c mov %rcx, %rdi
ffffffff9944cf8f bswap %rdi
ffffffff9944cf92 xor %rax, %rsi
ffffffff9944cf95 xor %rdi, %rsi
ffffffff9944cf98 movq %rsi, (%rcx)
ffffffff9944cf9b leaq 0x2000(%rdx), %rcx
ffffffff9944cfa2 movq (%r14), %rsi
ffffffff9944cfa5 cmpxchg16b %gs:(%rsi)
ffffffff9944cfaa jnz 0xffffffff9944cf59
ffffffff9944cfac movq 0x8(%rsp), %rax
ffffffff9944cfb1 subq %gs:0x28, %rax
ffffffff9944cfba jnz 0xffffffff9944d1fc
ffffffff9944cfc0 leaq -0x28(%rbp), %rsp
ffffffff9944cfc4 popq %rbx
ffffffff9944cfc5 popq %r12
ffffffff9944cfc7 popq %r13
ffffffff9944cfc9 popq %r14
ffffffff9944cfcb popq %r15
ffffffff9944cfcd popq %rbp
ffffffff9944cfce retq # PRED 38 cycles [126] 2.74 IPC <-------------
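
For reference, the hot block above is SLUB's freeing fast path; a rough
sketch of what it does (simplified from mm/slub.c, helper spellings vary
between kernel versions, so treat this as illustration only):

/* Push the object onto the per-CPU freelist without taking any lock. */
static void slab_free_fastpath_sketch(struct kmem_cache *s,
				      struct slab *slab, void *object)
{
	void *freelist;
	unsigned long tid;

redo:
	tid = this_cpu_read(s->cpu_slab->tid);
	if (unlikely(slab != this_cpu_read(s->cpu_slab->slab))) {
		/* Object belongs to a slab not cached on this CPU: slow path. */
		__slab_free(s, slab, object, object, 1, _RET_IP_);
		return;
	}

	freelist = this_cpu_read(s->cpu_slab->freelist);
	/* The XOR/BSWAP block above: hardened free-pointer store into the object. */
	set_freepointer(s, object, freelist);

	/*
	 * The CMPXCHG16B: swap (freelist, tid) as one 16-byte per-CPU update.
	 * If another alloc/free slipped in on this CPU (tid changed), retry --
	 * that is the jnz back to ...cf59.
	 */
	if (!this_cpu_cmpxchg_double(s->cpu_slab->freelist, s->cpu_slab->tid,
				     freelist, tid,
				     object, next_tid(tid)))
		goto redo;
}

The (freelist, tid) pairing is why the fast path needs a 16-byte cmpxchg
at all, which is the part being complained about above.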