linux-kernel - [PATCH v2] sock: add tracepoint for send recv length

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20230105100014.1935-1-cuiyunhui@bytedance.com>
Date:   Thu,  5 Jan 2023 18:00:14 +0800
From:   Yunhui Cui <cuiyunhui@...edance.com>
To:     rostedt@...dmis.org, mhiramat@...nel.org, davem@...emloft.net,
        edumazet@...gle.com, kuba@...nel.org, pabeni@...hat.com,
        cuiyunhui@...edance.com, duanxiongchun@...edance.com,
        kuniyu@...zon.com, linux-kernel@...r.kernel.org,
        linux-trace-kernel@...r.kernel.org
Subject: [PATCH v2] sock: add tracepoint for send recv length

Add 2 tracepoints to monitor the tcp/udp traffic
of per process and per cgroup.

Regarding monitoring the tcp/udp traffic of each process, the existing
implementation is https://www.atoptool.nl/netatop.php.
This solution is implemented by registering the hook function at the hook
point provided by the netfilter framework.

These hook functions may be in the soft interrupt context and cannot
directly obtain the pid. Some data structures are added to bind packets
and processes. For example, struct taskinfobucket, struct taskinfo ...

Every time the process sends and receives packets it needs multiple
hashmaps,resulting in low performance and it has the problem fo inaccurate
tcp/udp traffic statistics(for example: multiple threads share sockets).

Based on these 2 tracepoints, we have optimized and tested performance.
Time Per Request as an indicator, without monitoring: 50.95ms,
netatop: 63.27 ms, Hook on these tracepoints: 52.24ms.
The performance has been improved 10 times. The tcp/udp traffic of each
process has also been accurately counted.

We can obtain the information with kretprobe, but as we know, kprobe gets
the result by trappig in an exception, which loses performance compared
to tracepoint. We did a test for performance comparison. The results are
as follows. Time per request, sock_sendmsg(k,kr):  12.382ms,
tcp_send_length(tracepoint): 11.887ms，without hook：11.222ms

It can be seen that the performance loss of tracepoint is only half of
that of kprobe.

Signed-off-by: Yunhui Cui <cuiyunhui@...edance.com>
Signed-off-by: Xiongchun Duan <duanxiongchun@...edance.com>
---
 include/trace/events/sock.h | 50 +++++++++++++++++++++++++++++++++++++
 net/socket.c                | 23 +++++++++++++++--
 2 files changed, 71 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
index 777ee6cbe933..9a85b455a877 100644
--- a/include/trace/events/sock.h
+++ b/include/trace/events/sock.h
@@ -263,6 +263,56 @@ TRACE_EVENT(inet_sk_error_report,
 		  __entry->error)
 );
 
+/*
+ * sock send/recv msg length
+ * * Note: this class requires positive integer
+ */
+DECLARE_EVENT_CLASS(sock_msg_length,
+
+	TP_PROTO(struct sock *sk, __u16 family, __u16 protocol, int length,
+		int error, int flags),
+
+	TP_ARGS(sk, family, protocol, length, error, flags),
+
+	TP_STRUCT__entry(
+		__field(void *, sk)
+		__field(__u16, family)
+		__field(__u16, protocol)
+		__field(int, length)
+		__field(int, error)
+		__field(int, flags)
+	),
+
+	TP_fast_assign(
+		__entry->sk = sk;
+		__entry->family = sk->sk_family;
+		__entry->protocol = sk->sk_protocol;
+		__entry->length = length;
+		__entry->error = error;
+		__entry->flags = flags;
+	),
+
+	TP_printk("sk address = %p, family = %s protocol = %s, length = %d, error = %d, flags = 0x%x",
+			__entry->sk,
+			show_family_name(__entry->family),
+			show_inet_protocol_name(__entry->protocol),
+			__entry->length,
+			__entry->error, __entry->flags)
+);
+
+DEFINE_EVENT(sock_msg_length, sock_sendmsg_length,
+	TP_PROTO(struct sock *sk, __u16 family, __u16 protocol, int length,
+		 int error, int flags),
+
+	TP_ARGS(sk, family, protocol, length, error, flags)
+);
+
+DEFINE_EVENT(sock_msg_length, sock_recvmsg_length,
+	TP_PROTO(struct sock *sk, __u16 family, __u16 protocol, int length,
+		 int error, int flags),
+
+	TP_ARGS(sk, family, protocol, length, error, flags)
+);
 #endif /* _TRACE_SOCK_H */
 
 /* This part must be outside protection */
diff --git a/net/socket.c b/net/socket.c
index 888cd618a968..cd3887e12b76 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -106,6 +106,7 @@
 #include <net/busy_poll.h>
 #include <linux/errqueue.h>
 #include <linux/ptp_clock_kernel.h>
+#include <trace/events/sock.h>
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
 unsigned int sysctl_net_busy_read __read_mostly;
@@ -715,6 +716,10 @@ static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
 				     inet_sendmsg, sock, msg,
 				     msg_data_left(msg));
 	BUG_ON(ret == -EIOCBQUEUED);
+
+	trace_sock_sendmsg_length(sock->sk, sock->sk->sk_family,
+				  sock->sk->sk_protocol,
+				  ret > 0 ? ret : 0, ret > 0 ? 0 : ret, 0);
 	return ret;
 }
 
@@ -992,9 +997,17 @@ INDIRECT_CALLABLE_DECLARE(int inet6_recvmsg(struct socket *, struct msghdr *,
 static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
 				     int flags)
 {
-	return INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg,
+	int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg,
 				  inet_recvmsg, sock, msg, msg_data_left(msg),
 				  flags);
+
+	trace_sock_recvmsg_length(sock->sk, sock->sk->sk_family,
+				  sock->sk->sk_protocol,
+				  (ret > 0 && !(flags & MSG_PEEK)) ?
+				  ret : 0,
+				  (ret > 0 && !(flags & MSG_PEEK)) ? 0 : ret,
+				  flags);
+	return ret;
 }
 
 /**
@@ -1044,6 +1057,7 @@ static ssize_t sock_sendpage(struct file *file, struct page *page,
 {
 	struct socket *sock;
 	int flags;
+	int ret;
 
 	sock = file->private_data;
 
@@ -1051,7 +1065,12 @@ static ssize_t sock_sendpage(struct file *file, struct page *page,
 	/* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */
 	flags |= more;
 
-	return kernel_sendpage(sock, page, offset, size, flags);
+	ret = kernel_sendpage(sock, page, offset, size, flags);
+
+	trace_sock_sendmsg_length(sock->sk, sock->sk->sk_family,
+				  sock->sk->sk_protocol,
+				  ret > 0 ? ret : 0, ret > 0 ? 0 : ret, 0);
+	return ret;
 }
 
 static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
-- 
2.20.1