Message-ID: <090a2144-b8f6-4393-881d-9977eec6b13d@linux.dev>
Date: Wed, 9 Oct 2024 16:35:58 +0100
From: Vadim Fedorenko <vadim.fedorenko@...ux.dev>
To: Menglong Dong <menglong8.dong@...il.com>, kuba@...nel.org,
 Jason Xing <kerneljasonxing@...il.com>, Willem de Bruijn <willemb@...gle.com>
Cc: davem@...emloft.net, edumazet@...gle.com, pabeni@...hat.com,
 rostedt@...dmis.org, mhiramat@...nel.org, mathieu.desnoyers@...icios.com,
 dsahern@...nel.org, yan@...udflare.com, dongml2@...natelecom.cn,
 netdev@...r.kernel.org, linux-kernel@...r.kernel.org,
 linux-trace-kernel@...r.kernel.org
Subject: Re: [PATCH net-next] net: tcp: add tracepoint skb_latency for latency
 monitor

On 09/10/2024 13:17, Menglong Dong wrote:
> In this commit, we introduce a new tracepoint, "skb_latency", which is
> used to trace the latency of sending or receiving a packet. For now,
> only TCP is supported. Maybe we should call it "tcp_latency"?
> 
> Six stages are introduced in this commit to trace the networking
> latency.
> 
> The existing SO_TIMESTAMPING and MSG_TSTAMP_* can obtain timestamps
> for sent and received packets, but they are not convenient.
> 
> First, most applications were not written to use this facility, and we
> can't make them adopt it on the spot when a networking latency problem
> shows up.
> 
> Second, it's inefficient, as it needs to fetch the timestamps from the
> error queue with syscalls.
> 
> Third, the timestamps it offers are not enough to analyse the latency
> of sending or receiving a packet.
> 
> For me, the main usage of this tracepoint is to hook it from a BPF
> program, do some filtering, and capture the latencies I'm interested
> in.

Hi Menglong,

Looks like you are trying to solve a problem similar to the one Jason
is addressing in a different thread:

https://lore.kernel.org/netdev/20241008095109.99918-1-kerneljasonxing@gmail.com/

I also think it's a good idea to CC the bpf@...r.kernel.org mailing list.
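
Since the stated use case is hooking the tracepoint from BPF, a
minimal, untested libbpf-style sketch of a consumer could look like
the following, assuming the patch is applied and the tracepoint lands
in the "skb" category. The context struct simply mirrors
TP_STRUCT__entry from the patch (the leading 8 bytes stand in for the
common tracepoint fields); its layout is illustrative, not verified
against the generated format file:

#include <linux/types.h>
#include <bpf/bpf_helpers.h>

/* mirrors TP_STRUCT__entry of skb_latency in the patch below */
struct skb_latency_args {
        __u64 common;           /* common tracepoint fields */
        void *skbaddr;
        void *skaddr;
        __u64 latency;
        int type;
};

SEC("tracepoint/skb/skb_latency")
int on_skb_latency(struct skb_latency_args *ctx)
{
        /* e.g. only report ACK-stage latencies above 1 ms */
        if (ctx->type != 3 /* SKB_LATENCY_ACK */ ||
            ctx->latency < 1000000ULL)
                return 0;
        bpf_printk("skb=%p latency=%llu ns", ctx->skbaddr, ctx->latency);
        return 0;
}

char LICENSE[] SEC("license") = "GPL";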

See one more comment below.
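
On the commit message's point that the existing interface is
inconvenient: indeed, with SO_TIMESTAMPING each batch of timestamps has
to be drained from the socket error queue with an extra recvmsg() call,
roughly like this (abridged sketch; error handling and the send side
omitted):

#include <string.h>
#include <sys/socket.h>
#include <linux/net_tstamp.h>
#include <linux/errqueue.h>

/* enable software TX timestamps, then drain one batch of them */
static void drain_tx_tstamps(int fd)
{
        int on = SOF_TIMESTAMPING_TX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE;
        char ctrl[512];
        struct msghdr msg = {
                .msg_control = ctrl,
                .msg_controllen = sizeof(ctrl),
        };
        struct cmsghdr *cm;

        setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &on, sizeof(on));
        if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
                return;
        for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
                if (cm->cmsg_level == SOL_SOCKET &&
                    cm->cmsg_type == SCM_TIMESTAMPING) {
                        struct scm_timestamping tss;

                        memcpy(&tss, CMSG_DATA(cm), sizeof(tss));
                        /* tss.ts[0] holds the software timestamp */
                }
        }
}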

> Signed-off-by: Menglong Dong <dongml2@...natelecom.cn>
> ---
>   include/linux/skbuff.h     | 32 ++++++++++++++++++++++++++++++++
>   include/trace/events/skb.h | 30 ++++++++++++++++++++++++++++++
>   net/core/dev.c             |  2 ++
>   net/core/net-traces.c      |  1 +
>   net/core/skbuff.c          | 20 ++++++++++++++++++++
>   net/ipv4/tcp.c             |  5 +++++
>   net/ipv4/tcp_input.c       |  1 +
>   net/ipv4/tcp_ipv4.c        |  2 ++
>   net/ipv4/tcp_output.c      |  7 +++++++
>   9 files changed, 100 insertions(+)
> 
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 39f1d16f3628..77fcda96f1fd 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -1104,6 +1104,35 @@ struct sk_buff {
>   #endif
>   #define SKB_BF_MONO_TC_OFFSET		offsetof(struct sk_buff, __mono_tc_offset)
>   
> +enum skb_latency_type {
> +	/* the latency from the skb being queued in the send buffer until
> +	 * it is passed from L4 to L3. Latency at this stage can be caused
> +	 * by Nagle's algorithm.
> +	 */
> +	SKB_LATENCY_SEND,
> +	/* the latency from L3 to the skb entering the packet scheduler
> +	 * in the output path.
> +	 */
> +	SKB_LATENCY_SCHED,
> +	/* the latency from L3 to the skb entering the driver in the output path */
> +	SKB_LATENCY_NIC,
> +	/* the latency from L3 until the skb is acknowledged by the peer.
> +	 * This includes the latency caused by delayed ACKs. If the skb is
> +	 * retransmitted, this refers to the last retransmission.
> +	 */
> +	SKB_LATENCY_ACK,
> +	/* the latency from the driver to the skb entering L4 in the input path */
> +	SKB_LATENCY_RECV,
> +	/* the latency from the driver until the skb is picked up from the
> +	 * receive queue by the user in the input path.
> +	 */
> +	SKB_LATENCY_PICK,
> +	SKB_LATENCY_MAX,
> +};
> +
> +extern int skb_latency_regfunc(void);
> +extern void skb_latency_unregfunc(void);
> +
>   #ifdef __KERNEL__
>   /*
>    *	Handling routines are only of interest to the kernel
> @@ -4500,6 +4529,9 @@ static inline void skb_tx_timestamp(struct sk_buff *skb)
>   		skb_tstamp_tx(skb, NULL);
>   }
>   
> +void skb_latency_notify(struct sk_buff *skb, struct sock *sk,
> +			enum skb_latency_type type);
> +
>   /**
>    * skb_complete_wifi_ack - deliver skb with wifi status
>    *
> diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h
> index b877133cd93a..26825dac4347 100644
> --- a/include/trace/events/skb.h
> +++ b/include/trace/events/skb.h
> @@ -92,6 +92,36 @@ TRACE_EVENT(skb_copy_datagram_iovec,
>   	TP_printk("skbaddr=%p len=%d", __entry->skbaddr, __entry->len)
>   );
>   
> +TRACE_EVENT_FN(skb_latency,
> +
> +	TP_PROTO(struct sk_buff *skb, struct sock *sk, enum skb_latency_type type),
> +
> +	TP_ARGS(skb, sk, type),
> +
> +	TP_STRUCT__entry(
> +		__field(void *,		skbaddr)
> +		__field(void *,		skaddr)
> +		__field(u64,		latency)
> +		__field(enum skb_latency_type,	type)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->skbaddr = skb;
> +		__entry->skaddr = sk;
> +		__entry->type = type;
> +		__entry->latency = skb->tstamp ?
> +			(skb->tstamp_type == SKB_CLOCK_REALTIME ?
> +				net_timedelta(skb->tstamp) :
> +				ktime_get_ns() - skb->tstamp) : 0;
> +	),
> +
> +	TP_printk("skbaddr=%p skaddr=%p type=%d latency=%lluns",
> +		  __entry->skbaddr, __entry->skaddr, __entry->type,
> +		  __entry->latency),
> +
> +	skb_latency_regfunc, skb_latency_unregfunc
> +);
> +
>   #endif /* _TRACE_SKB_H */
>   
>   /* This part must be outside protection */
> diff --git a/net/core/dev.c b/net/core/dev.c
> index ea5fbcd133ae..ab036bbadc9a 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -4347,6 +4347,8 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
>   	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
>   		__skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
>   
> +	skb_latency_notify(skb, NULL, SKB_LATENCY_SCHED);
> +
>   	/* Disable soft irqs for various locks below. Also
>   	 * stops preemption for RCU.
>   	 */
> diff --git a/net/core/net-traces.c b/net/core/net-traces.c
> index f2fa34b1d78d..a0c1e32a5928 100644
> --- a/net/core/net-traces.c
> +++ b/net/core/net-traces.c
> @@ -57,6 +57,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_event_send_dead);
>   EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_cleanup_and_release);
>   
>   EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
> +EXPORT_TRACEPOINT_SYMBOL_GPL(skb_latency);
>   
>   EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
>   
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 00afeb90c23a..041947d7049c 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -5521,6 +5521,7 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
>   {
>   	struct sock *sk = skb->sk;
>   
> +	skb_latency_notify(skb, sk, SKB_LATENCY_NIC);
>   	if (!skb_may_tx_timestamp(sk, false))
>   		goto err;
>   
> @@ -5539,6 +5540,24 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
>   }
>   EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
>   
> +void skb_latency_notify(struct sk_buff *skb, struct sock *sk,
> +			enum skb_latency_type type)
> +{
> +	trace_skb_latency(skb, sk, type);
> +}
> +EXPORT_SYMBOL_GPL(skb_latency_notify);
> +
> +int skb_latency_regfunc(void)
> +{
> +	net_enable_timestamp();
> +	return 0;
> +}
> +
> +void skb_latency_unregfunc(void)
> +{
> +	net_disable_timestamp();
> +}
> +
>   void __skb_tstamp_tx(struct sk_buff *orig_skb,
>   		     const struct sk_buff *ack_skb,
>   		     struct skb_shared_hwtstamps *hwtstamps,
> @@ -5599,6 +5618,7 @@ EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
>   void skb_tstamp_tx(struct sk_buff *orig_skb,
>   		   struct skb_shared_hwtstamps *hwtstamps)
>   {
> +	skb_latency_notify(orig_skb, NULL, SKB_LATENCY_NIC);
>   	return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk,
>   			       SCM_TSTAMP_SND);
>   }
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 82cc4a5633ce..b4e7bd9c4b6f 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -283,6 +283,7 @@
>   #include <net/busy_poll.h>
>   #include <net/hotdata.h>
>   #include <trace/events/tcp.h>
> +#include <trace/events/skb.h>
>   #include <net/rps.h>
>   
>   #include "../core/devmem.h"
> @@ -688,6 +689,7 @@ void tcp_skb_entail(struct sock *sk, struct sk_buff *skb)
>   		tp->nonagle &= ~TCP_NAGLE_PUSH;
>   
>   	tcp_slow_start_after_idle_check(sk);
> +	skb_set_delivery_time(skb, tp->tcp_clock_cache, SKB_CLOCK_MONOTONIC);
>   }
>   
>   static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
> @@ -1137,6 +1139,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
>   
>   	/* Ok commence sending. */
>   	copied = 0;
> +	tcp_mstamp_refresh(tp);
>   
>   restart:
>   	mss_now = tcp_send_mss(sk, &size_goal, flags);
> @@ -1318,6 +1321,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
>   			goto do_error;
>   
>   		mss_now = tcp_send_mss(sk, &size_goal, flags);
> +		tcp_mstamp_refresh(tp);
>   	}
>   
>   out:
> @@ -1519,6 +1523,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
>   
>   static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
>   {
> +	skb_latency_notify(skb, sk, SKB_LATENCY_PICK);
>   	__skb_unlink(skb, &sk->sk_receive_queue);
>   	if (likely(skb->destructor == sock_rfree)) {
>   		sock_rfree(skb);
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index cc05ec1faac8..99218ec5faa8 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -3293,6 +3293,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
>   {
>   	const struct skb_shared_info *shinfo;
>   
> +	skb_latency_notify(skb, sk, SKB_LATENCY_ACK);
>   	/* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
>   	if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
>   		return;
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 985028434f64..de7a9fd7773a 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -85,6 +85,7 @@
>   #include <linux/scatterlist.h>
>   
>   #include <trace/events/tcp.h>
> +#include <trace/events/skb.h>
>   
>   #ifdef CONFIG_TCP_MD5SIG
>   static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
> @@ -2222,6 +2223,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
>   	if (!sk)
>   		goto no_tcp_socket;
>   
> +	skb_latency_notify(skb, sk, SKB_LATENCY_RECV);
>   	if (sk->sk_state == TCP_TIME_WAIT)
>   		goto do_time_wait;
>   

Why don't you do the same notify for TCP over IPv6?
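
For reference, an untested sketch of the equivalent change in
tcp_v6_rcv() (net/ipv6/tcp_ipv6.c), which has the same
!sk / TCP_TIME_WAIT structure as the IPv4 path:

        if (!sk)
                goto no_tcp_socket;

        skb_latency_notify(skb, sk, SKB_LATENCY_RECV);
        if (sk->sk_state == TCP_TIME_WAIT)
                goto do_time_wait;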

> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 08772395690d..db4bae23986f 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -48,6 +48,7 @@
>   #include <linux/skbuff_ref.h>
>   
>   #include <trace/events/tcp.h>
> +#include <trace/events/skb.h>
>   
>   /* Refresh clocks of a TCP socket,
>    * ensuring monotonically increasing values.
> @@ -2512,6 +2513,7 @@ static int tcp_mtu_probe(struct sock *sk)
>   	skb = tcp_send_head(sk);
>   	skb_copy_decrypted(nskb, skb);
>   	mptcp_skb_ext_copy(nskb, skb);
> +	nskb->tstamp = skb->tstamp;
>   
>   	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
>   	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
> @@ -2540,6 +2542,7 @@ static int tcp_mtu_probe(struct sock *sk)
>   			break;
>   	}
>   	tcp_init_tso_segs(nskb, nskb->len);
> +	skb_latency_notify(nskb, sk, SKB_LATENCY_SEND);
>   
>   	/* We're ready to send.  If this fails, the probe will
>   	 * be resegmented into mss-sized pieces by tcp_write_xmit().
> @@ -2827,6 +2830,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
>   		if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)
>   			break;
>   
> +		skb_latency_notify(skb, sk, SKB_LATENCY_SEND);
>   		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
>   			break;
>   
> @@ -3616,6 +3620,7 @@ void tcp_send_fin(struct sock *sk)
>   		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
>   		tcp_init_nondata_skb(skb, tp->write_seq,
>   				     TCPHDR_ACK | TCPHDR_FIN);
> +		skb_set_delivery_time(skb, tcp_clock_ns(), SKB_CLOCK_MONOTONIC);
>   		tcp_queue_skb(sk, skb);
>   	}
>   	__tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
> @@ -4048,6 +4053,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
>   		goto done;
>   	}
>   
> +	skb_set_delivery_time(syn_data, tp->tcp_clock_cache, SKB_CLOCK_MONOTONIC);
>   	/* data was not sent, put it in write_queue */
>   	__skb_queue_tail(&sk->sk_write_queue, syn_data);
>   	tp->packets_out -= tcp_skb_pcount(syn_data);
> @@ -4353,6 +4359,7 @@ int tcp_write_wakeup(struct sock *sk, int mib)
>   		} else if (!tcp_skb_pcount(skb))
>   			tcp_set_skb_tso_segs(skb, mss);
>   
> +		skb_latency_notify(skb, sk, SKB_LATENCY_SEND);
>   		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
>   		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
>   		if (!err)

