[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <5c565b2c-85a5-9141-112f-be854cccc558@gmail.com>
Date: Fri, 6 Aug 2021 12:08:59 +0200
From: Eric Dumazet <eric.dumazet@...il.com>
To: Cong Wang <xiyou.wangcong@...il.com>, netdev@...r.kernel.org
Cc: Qitao Xu <qitao.xu@...edance.com>,
Cong Wang <cong.wang@...edance.com>
Subject: Re: [Patch net-next 02/13] ipv4: introduce tracepoint
trace_ip_queue_xmit()
On 8/5/21 8:57 PM, Cong Wang wrote:
> From: Qitao Xu <qitao.xu@...edance.com>
>
> Tracepoint trace_ip_queue_xmit() is introduced to trace skb
> at the entrance of IP layer on TX side.
>
> Reviewed-by: Cong Wang <cong.wang@...edance.com>
> Signed-off-by: Qitao Xu <qitao.xu@...edance.com>
> ---
> include/trace/events/ip.h | 42 +++++++++++++++++++++++++++++++++++++++
> net/ipv4/ip_output.c | 10 +++++++++-
> 2 files changed, 51 insertions(+), 1 deletion(-)
>
> diff --git a/include/trace/events/ip.h b/include/trace/events/ip.h
> index 008f821ebc50..553ae7276732 100644
> --- a/include/trace/events/ip.h
> +++ b/include/trace/events/ip.h
> @@ -41,6 +41,48 @@
> TP_STORE_V4MAPPED(__entry, saddr, daddr)
> #endif
>
> +TRACE_EVENT(ip_queue_xmit,
> +
> + TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
> +
> + TP_ARGS(sk, skb),
> +
> + TP_STRUCT__entry(
> + __field(const void *, skbaddr)
> + __field(const void *, skaddr)
> + __field(__u16, sport)
> + __field(__u16, dport)
> + __array(__u8, saddr, 4)
> + __array(__u8, daddr, 4)
> + __array(__u8, saddr_v6, 16)
> + __array(__u8, daddr_v6, 16)
> + ),
> +
> + TP_fast_assign(
> + struct inet_sock *inet = inet_sk(sk);
> + __be32 *p32;
> +
> + __entry->skbaddr = skb;
> + __entry->skaddr = sk;
> +
> + __entry->sport = ntohs(inet->inet_sport);
> + __entry->dport = ntohs(inet->inet_dport);
> +
> + p32 = (__be32 *) __entry->saddr;
> + *p32 = inet->inet_saddr;
> +
> + p32 = (__be32 *) __entry->daddr;
> + *p32 = inet->inet_daddr;
> +
> + TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
> + sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
> + ),
> +
> + TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c skbaddr=%px",
> + __entry->sport, __entry->dport, __entry->saddr, __entry->daddr,
> + __entry->saddr_v6, __entry->daddr_v6, __entry->skbaddr)
> +);
> +
> #endif /* _TRACE_IP_H */
>
> /* This part must be outside protection */
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index 6b04a88466b2..dcf94059112e 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -82,6 +82,7 @@
> #include <linux/netfilter_bridge.h>
> #include <linux/netlink.h>
> #include <linux/tcp.h>
> +#include <trace/events/ip.h>
>
> static int
> ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
> @@ -536,7 +537,14 @@ EXPORT_SYMBOL(__ip_queue_xmit);
>
> int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
> {
> - return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
> + int ret;
> +
> + ret = __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
> + if (!ret)
> + trace_ip_queue_xmit(sk, skb);
> +
> + return ret;
> +
> }
> EXPORT_SYMBOL(ip_queue_xmit);
>
>
While it is useful to have stuff like this,
ddding so many trace points has a certain cost.
I fear that you have not determined this cost
on workloads where we enter these functions with cold caches.
For instance, before this patch, compiler gives us :
2e10 <ip_queue_xmit>:
2e10: e8 00 00 00 00 callq 2e15 <ip_queue_xmit+0x5> (__fentry__-0x4)
2e15: 0f b6 8f 1c 03 00 00 movzbl 0x31c(%rdi),%ecx
2e1c: e9 ef fb ff ff jmpq 2a10 <__ip_queue_xmit>
After patch, we see the compiler had to save/restore registers, and no longer
jumps to __ip_queue_xmit. Code is bigger, even when tracepoint is not enabled.
2e10: e8 00 00 00 00 callq 2e15 <ip_queue_xmit+0x5>
2e11: R_X86_64_PLT32 __fentry__-0x4
2e15: 41 55 push %r13
2e17: 49 89 f5 mov %rsi,%r13
2e1a: 41 54 push %r12
2e1c: 55 push %rbp
2e1d: 0f b6 8f 1c 03 00 00 movzbl 0x31c(%rdi),%ecx
2e24: 48 89 fd mov %rdi,%rbp
2e27: e8 00 00 00 00 callq 2e2c <ip_queue_xmit+0x1c>
2e28: R_X86_64_PLT32 __ip_queue_xmit-0x4
2e2c: 41 89 c4 mov %eax,%r12d
2e2f: 85 c0 test %eax,%eax
2e31: 74 09 je 2e3c <ip_queue_xmit+0x2c>
2e33: 44 89 e0 mov %r12d,%eax
2e36: 5d pop %rbp
2e37: 41 5c pop %r12
2e39: 41 5d pop %r13
2e3b: c3 retq
2e3c: 66 90 xchg %ax,%ax
2e3e: 44 89 e0 mov %r12d,%eax
2e41: 5d pop %rbp
2e42: 41 5c pop %r12
2e44: 41 5d pop %r13
2e46: c3 retq
---- tracing code ---
2e47: 65 8b 05 00 00 00 00 mov %gs:0x0(%rip),%eax # 2e4e <ip_queue_xmit+0x3e>
2e4a: R_X86_64_PC32 cpu_number-0x4
2e4e: 89 c0 mov %eax,%eax
2e50: 48 0f a3 05 00 00 00 bt %rax,0x0(%rip) # 2e58 <ip_queue_xmit+0x48>
2e57: 00
2e54: R_X86_64_PC32 __cpu_online_mask-0x4
2e58: 73 d9 jae 2e33 <ip_queue_xmit+0x23>
2e5a: 48 8b 05 00 00 00 00 mov 0x0(%rip),%rax # 2e61 <ip_queue_xmit+0x51>
2e5d: R_X86_64_PC32 __tracepoint_ip_queue_xmit+0x3c
2e61: 48 85 c0 test %rax,%rax
2e64: 74 0f je 2e75 <ip_queue_xmit+0x65>
2e66: 48 8b 78 08 mov 0x8(%rax),%rdi
2e6a: 4c 89 ea mov %r13,%rdx
2e6d: 48 89 ee mov %rbp,%rsi
2e70: e8 00 00 00 00 callq 2e75 <ip_queue_xmit+0x65>
2e71: R_X86_64_PLT32 __SCT__tp_func_ip_queue_xmit-0x4
2e75: 44 89 e0 mov %r12d,%eax
2e78: 5d pop %rbp
2e79: 41 5c pop %r12
2e7b: 41 5d pop %r13
2e7d: c3 retq
Powered by blists - more mailing lists