Message-ID: <CANn89i+W__5-jDUdM=_97jzQy9Wq+n9KBEuOGjUi=Fxe_ntqbg@mail.gmail.com>
Date: Mon, 9 Jan 2023 10:56:04 +0100
From: Eric Dumazet <edumazet@...gle.com>
To: Yunhui Cui <cuiyunhui@...edance.com>
Cc: rostedt@...dmis.org, mhiramat@...nel.org, davem@...emloft.net,
kuba@...nel.org, pabeni@...hat.com, kuniyu@...zon.com,
xiyou.wangcong@...il.com, duanxiongchun@...edance.com,
linux-kernel@...r.kernel.org, linux-trace-kernel@...r.kernel.org,
netdev@...r.kernel.org
Subject: Re: [PATCH v4] sock: add tracepoint for send recv length
On Sun, Jan 8, 2023 at 3:56 AM Yunhui Cui <cuiyunhui@...edance.com> wrote:
>
> Add two tracepoints to monitor the per-process and per-cgroup tcp/udp
> traffic.
>
> Regarding monitoring the tcp/udp traffic of each process, there are
> two existing solutions: the first is https://www.atoptool.nl/netatop.php,
> and the second is via kprobe/kretprobe.
>
> The netatop solution is implemented by registering hook functions at
> the hook points provided by the netfilter framework.
>
> These hook functions may run in soft interrupt context and cannot
> directly obtain the pid, so some data structures are added to bind
> packets to processes, e.g. struct taskinfobucket, struct taskinfo, ...
>
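> For reference, a minimal sketch of that style of netfilter hook (the
> hook point and handler below are illustrative, not netatop's actual
> code):
>
> #include <linux/netfilter.h>
> #include <linux/netfilter_ipv4.h>
>
> static unsigned int count_hook(void *priv, struct sk_buff *skb,
>                                const struct nf_hook_state *state)
> {
>         /* May run in softirq context: 'current' is not necessarily
>          * the sending/receiving process, so packets must be mapped
>          * back to tasks through extra bookkeeping.
>          */
>         return NF_ACCEPT;
> }
>
> static const struct nf_hook_ops count_ops = {
>         .hook     = count_hook,
>         .pf       = NFPROTO_IPV4,
>         .hooknum  = NF_INET_LOCAL_OUT,
>         .priority = NF_IP_PRI_FIRST,
> };
>
> /* registered at module init with nf_register_net_hook(&init_net, &count_ops) */
>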
> Every time the process sends or receives packets, multiple hashmap
> lookups are needed, resulting in low performance, and it has the
> problem of inaccurate tcp/udp traffic statistics (for example, when
> multiple threads share sockets).
>
> We can obtain the information with kretprobe, but as we know, kprobe
> gets its result by trapping into an exception, which costs more
> performance than a tracepoint.
>
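> For illustration, a minimal kretprobe sketch of that alternative (the
> probed symbol and handler below are hypothetical examples, not an
> actual tool's code):
>
> #include <linux/kprobes.h>
> #include <linux/module.h>
> #include <linux/sched.h>
>
> static int recv_ret_handler(struct kretprobe_instance *ri,
>                             struct pt_regs *regs)
> {
>         int ret = regs_return_value(regs);
>
>         /* Runs in process context, so 'current' is the caller, but
>          * every hit goes through the kprobe trap machinery.
>          */
>         if (ret > 0)
>                 pr_debug("%s received %d bytes\n", current->comm, ret);
>         return 0;
> }
>
> static struct kretprobe recv_krp = {
>         .kp.symbol_name = "sock_recvmsg",
>         .handler        = recv_ret_handler,
> };
>
> /* register_kretprobe(&recv_krp) at module init,
>  * unregister_kretprobe(&recv_krp) at module exit.
>  */
>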
> We compared the performance of tracepoints with the above two methods, and
> the results are as follows:
>
> ab -n 1000000 -c 1000 -r http://127.0.0.1/index.html
> without trace:
> Time per request: 39.660 [ms] (mean)
> Time per request: 0.040 [ms] (mean, across all concurrent requests)
>
> netatop:
> Time per request: 50.717 [ms] (mean)
> Time per request: 0.051 [ms] (mean, across all concurrent requests)
>
> kretprobe:
> Time per request: 43.168 [ms] (mean)
> Time per request: 0.043 [ms] (mean, across all concurrent requests)
>
> tracepoint:
> Time per request: 41.004 [ms] (mean)
> Time per request: 0.041 [ms] (mean, across all concurrent requests)
>
> It can be seen that the tracepoint adds the least overhead of the
> three methods.
>
> Signed-off-by: Yunhui Cui <cuiyunhui@...edance.com>
> Signed-off-by: Xiongchun Duan <duanxiongchun@...edance.com>
> ---
> include/trace/events/sock.h | 48 +++++++++++++++++++++++++++++++++++++
> net/socket.c | 23 ++++++++++++++----
> 2 files changed, 67 insertions(+), 4 deletions(-)
>
> diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
> index 777ee6cbe933..d00a5b272404 100644
> --- a/include/trace/events/sock.h
> +++ b/include/trace/events/sock.h
> @@ -263,6 +263,54 @@ TRACE_EVENT(inet_sk_error_report,
> __entry->error)
> );
>
> +/*
> + * sock send/recv msg length
> + */
> +DECLARE_EVENT_CLASS(sock_msg_length,
> +
> + TP_PROTO(struct sock *sk, __u16 family, __u16 protocol, int ret,
> + int flags),
> +
> + TP_ARGS(sk, family, protocol, ret, flags),
> +
> + TP_STRUCT__entry(
> + __field(void *, sk)
> + __field(__u16, family)
> + __field(__u16, protocol)
> + __field(int, length)
> + __field(int, error)
> + __field(int, flags)
> + ),
> +
> + TP_fast_assign(
> + __entry->sk = sk;
> + __entry->family = sk->sk_family;
> + __entry->protocol = sk->sk_protocol;
> + __entry->length = ret > 0 ? ret : 0;
> + __entry->error = ret < 0 ? ret : 0;
> + __entry->flags = flags;
> + ),
> +
> + TP_printk("sk address = %p, family = %s protocol = %s, length = %d, error = %d, flags = 0x%x",
> + __entry->sk, show_family_name(__entry->family),
> + show_inet_protocol_name(__entry->protocol),
> + __entry->length,
> + __entry->error, __entry->flags)
> +);
> +
> +DEFINE_EVENT(sock_msg_length, sock_send_length,
> + TP_PROTO(struct sock *sk, __u16 family, __u16 protocol, int ret,
> + int flags),
> +
> + TP_ARGS(sk, family, protocol, ret, flags)
> +);
> +
> +DEFINE_EVENT(sock_msg_length, sock_recv_length,
> + TP_PROTO(struct sock *sk, __u16 family, __u16 protocol, int ret,
> + int flags),
> +
> + TP_ARGS(sk, family, protocol, ret, flags)
> +);
> #endif /* _TRACE_SOCK_H */
>
> /* This part must be outside protection */
> diff --git a/net/socket.c b/net/socket.c
> index 888cd618a968..60a1ff95b4b1 100644
> --- a/net/socket.c
> +++ b/net/socket.c
> @@ -106,6 +106,7 @@
> #include <net/busy_poll.h>
> #include <linux/errqueue.h>
> #include <linux/ptp_clock_kernel.h>
> +#include <trace/events/sock.h>
>
> #ifdef CONFIG_NET_RX_BUSY_POLL
> unsigned int sysctl_net_busy_read __read_mostly;
> @@ -715,6 +716,9 @@ static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
> inet_sendmsg, sock, msg,
> msg_data_left(msg));
> BUG_ON(ret == -EIOCBQUEUED);
> +
> + trace_sock_send_length(sock->sk, sock->sk->sk_family,
> + sock->sk->sk_protocol, ret, 0);
Note: at least for CONFIG_RETPOLINE=y and gcc 12.2, the compiler adds
many additional instructions (and additional memory reads), even when
the tracepoint is not enabled.

Contrary to some belief, adding a tracepoint is not always 'free':
tail calls, for example, are replaced with normal calls.
sock_recvmsg_nosec:
pushq %r12 #
movl %edx, %r12d # tmp123, flags
pushq %rbp #
# net/socket.c:999: int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg,
movl %r12d, %ecx # flags,
# net/socket.c:998: {
movq %rdi, %rbp # tmp121, sock
pushq %rbx #
# net/socket.c:999: int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg,
movq 32(%rdi), %rax # sock_19(D)->ops, sock_19(D)->ops
# ./include/linux/uio.h:270: return i->count;
movq 32(%rsi), %rdx # MEM[(const struct iov_iter *)msg_20(D) + 16B].count, pretmp_48
# net/socket.c:999: int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg,
movq 144(%rax), %rax # _1->recvmsg, _2
cmpq $inet6_recvmsg, %rax #, _2
jne .L107 #,
call inet6_recvmsg #
movl %eax, %ebx # tmp124, <retval>
.L108:
# net/socket.c:1003: trace_sock_recv_length(sock->sk, sock->sk->sk_family,
xorl %r8d, %r8d # tmp127
testl %ebx, %ebx # <retval>
# net/socket.c:1004: sock->sk->sk_protocol,
movq 24(%rbp), %rsi # sock_19(D)->sk, _10
# net/socket.c:1003: trace_sock_recv_length(sock->sk, sock->sk->sk_family,
cmovle %ebx, %r8d # <retval>,, tmp119
testb $2, %r12b #, flags
# net/socket.c:1004: sock->sk->sk_protocol,
movzwl 516(%rsi), %ecx # _10->sk_protocol,
# net/socket.c:1003: trace_sock_recv_length(sock->sk, sock->sk->sk_family,
movzwl 16(%rsi), %edx # _10->__sk_common.skc_family,
# net/socket.c:1003: trace_sock_recv_length(sock->sk, sock->sk->sk_family,
cmove %ebx, %r8d # tmp119,, <retval>, iftmp.54_16
# ./arch/x86/include/asm/jump_label.h:27: asm_volatile_goto("1:"
#APP
# 27 "./arch/x86/include/asm/jump_label.h" 1
1:jmp .L111 # objtool NOPs this #
.pushsection __jump_table, "aw"
.balign 8
.long 1b - .
.long .L111 - . #
.quad __tracepoint_sock_recv_length+8 + 2 - . #,
.popsection
# 0 "" 2
#NO_APP
.L106:
# net/socket.c:1008: }
movl %ebx, %eax # <retval>,
popq %rbx #
popq %rbp #
popq %r12 #
ret
.L111:
# ./include/trace/events/sock.h:308: DEFINE_EVENT(sock_msg_length, sock_recv_length,
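
One way to keep those extra reads off the disabled path is to guard the
argument evaluation with the trace_<event>_enabled() helper that the
tracepoint machinery generates - a sketch, not necessarily worth the
extra code here:

	if (trace_sock_recv_length_enabled())
		trace_sock_recv_length(sock->sk, sock->sk->sk_family,
				       sock->sk->sk_protocol,
				       !(flags & MSG_PEEK) ? ret :
						(ret < 0 ? ret : 0), flags);

With that guard, the loads of sk_family/sk_protocol and the cmov dance
above are only executed when the event is actually enabled.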
> return ret;
> }
>
> @@ -992,9 +996,15 @@ INDIRECT_CALLABLE_DECLARE(int inet6_recvmsg(struct socket *, struct msghdr *,
> static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
> int flags)
> {
> - return INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg,
> - inet_recvmsg, sock, msg, msg_data_left(msg),
> - flags);
> + int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg,
> + inet_recvmsg, sock, msg,
> + msg_data_left(msg), flags);
> +
> + trace_sock_recv_length(sock->sk, sock->sk->sk_family,
> + sock->sk->sk_protocol,
> + !(flags & MSG_PEEK) ? ret :
> + (ret < 0 ? ret : 0), flags);
> + return ret;
> }
>
> /**
> @@ -1044,6 +1054,7 @@ static ssize_t sock_sendpage(struct file *file, struct page *page,
> {
> struct socket *sock;
> int flags;
> + int ret;
>
> sock = file->private_data;
>
> @@ -1051,7 +1062,11 @@ static ssize_t sock_sendpage(struct file *file, struct page *page,
> /* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */
> flags |= more;
>
> - return kernel_sendpage(sock, page, offset, size, flags);
> + ret = kernel_sendpage(sock, page, offset, size, flags);
> +
> + trace_sock_send_length(sock->sk, sock->sk->sk_family,
> + sock->sk->sk_protocol, ret, 0);
> + return ret;
> }
>
> static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
> --
> 2.20.1
>