Message-ID: <9be3733eee16bb81a7e8e2e57ebcc008f95cae08.1717105215.git.yan@cloudflare.com>
Date: Thu, 30 May 2024 14:46:56 -0700
From: Yan Zhai <yan@...udflare.com>
To: netdev@...r.kernel.org
Cc: "David S. Miller" <davem@...emloft.net>,
	Eric Dumazet <edumazet@...gle.com>,
	Jakub Kicinski <kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>,
	Simon Horman <horms@...nel.org>, David Ahern <dsahern@...nel.org>,
	Abhishek Chauhan <quic_abchauha@...cinc.com>,
	Mina Almasry <almasrymina@...gle.com>,
	Florian Westphal <fw@...len.de>,
	Alexander Lobakin <aleksander.lobakin@...el.com>,
	David Howells <dhowells@...hat.com>, Jiri Pirko <jiri@...nulli.us>,
	Daniel Borkmann <daniel@...earbox.net>,
	Sebastian Andrzej Siewior <bigeasy@...utronix.de>,
	Lorenzo Bianconi <lorenzo@...nel.org>,
	Pavel Begunkov <asml.silence@...il.com>,
	linux-kernel@...r.kernel.org, kernel-team@...udflare.com,
	Jesper Dangaard Brouer <hawk@...nel.org>
Subject: [RFC net-next 1/6] net: add kfree_skb_for_sk function

Implement a new kfree_skb_for_sk function to replace kfree_skb_reason on a
few local receive paths. The function accepts an extra receiving-socket
argument, which is stored in skb->cb for consumption by the
kfree_skb/consume_skb tracepoints. With this extra bit of information, it
becomes easier to attribute dropped packets to netns/containers and
sockets/services for performance and error monitoring purposes.
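
For illustration only (not part of this patch), a hypothetical receive-path
helper could report an already-resolved socket together with the drop reason
roughly as sketched below; the helper name is made up, and only the
kfree_skb_for_sk signature comes from this patch:

	#include <linux/skbuff.h>	/* kfree_skb_for_sk, enum skb_drop_reason */

	/* Hypothetical sketch: a local rx path that has already looked up
	 * 'sk' reports it alongside the drop reason, instead of calling
	 * kfree_skb_reason() and losing the skb-to-socket association.
	 */
	static void example_rcv_drop(struct sk_buff *skb, struct sock *sk,
				     enum skb_drop_reason reason)
	{
		/* sk may be NULL when no receiving socket was found */
		kfree_skb_for_sk(skb, sk, reason);
	}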

Signed-off-by: Yan Zhai <yan@...udflare.com>
---
 include/linux/skbuff.h | 48 ++++++++++++++++++++++++++++++++++++++++--
 net/core/dev.c         | 21 +++++++-----------
 net/core/skbuff.c      | 29 +++++++++++++------------
 3 files changed, 70 insertions(+), 28 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index fe7d8dbef77e..66f5b06798f2 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1251,8 +1251,52 @@ static inline bool skb_data_unref(const struct sk_buff *skb,
 	return true;
 }
 
-void __fix_address
-kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason);
+/*
+ * Some protocols will clear or reuse the skb->dev field for other purposes.
+ * For example, the TCP stack reuses the pointer for out-of-order packet
+ * handling. This causes problems for drop monitoring on the kfree_skb
+ * tracepoint, since no other field of an skb provides netns information. In
+ * addition, it is complicated to recover the receiving socket for dropped
+ * packets, because the socket lookup can be done by an sk-lookup BPF program.
+ *
+ * This can be addressed by also passing the rx socket to the tracepoint,
+ * because it carries a valid netns binding as well.
+ */
+struct kfree_skb_cb {
+	enum skb_drop_reason reason; /* used only by dev_kfree_skb_irq */
+	struct sock *rx_sk;
+};
+
+#define KFREE_SKB_CB(skb) ((struct kfree_skb_cb *)(skb)->cb)
+
+/* Save cb->rx_sk before calling the kfree_skb/consume_skb tracepoint, and
+ * restore it afterwards. This is necessary because some skb destructors
+ * might rely on values in skb->cb, e.g. unix_destruct_scm.
+ */
+#define _call_trace_kfree_skb(action, skb, sk, ...)	do {	\
+	if (trace_##action##_skb_enabled()) {			\
+		struct kfree_skb_cb saved;			\
+		saved.rx_sk = KFREE_SKB_CB(skb)->rx_sk;		\
+		KFREE_SKB_CB(skb)->rx_sk = sk;			\
+		trace_##action##_skb((skb), ## __VA_ARGS__);	\
+		KFREE_SKB_CB(skb)->rx_sk = saved.rx_sk;		\
+	}							\
+} while (0)
+
+#define call_trace_kfree_skb(skb, sk, ...) \
+	_call_trace_kfree_skb(kfree, skb, sk, ## __VA_ARGS__)
+
+#define call_trace_consume_skb(skb, sk, ...) \
+	_call_trace_kfree_skb(consume, skb, sk, ## __VA_ARGS__)
+
+void __fix_address kfree_skb_for_sk(struct sk_buff *skb, struct sock *rx_sk,
+				    enum skb_drop_reason reason);
+
+static inline void
+kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
+{
+	kfree_skb_for_sk(skb, NULL, reason);
+}
 
 /**
  *	kfree_skb - free an sk_buff with 'NOT_SPECIFIED' reason
diff --git a/net/core/dev.c b/net/core/dev.c
index 85fe8138f3e4..17516f26be92 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3135,15 +3135,6 @@ void __netif_schedule(struct Qdisc *q)
 }
 EXPORT_SYMBOL(__netif_schedule);
 
-struct dev_kfree_skb_cb {
-	enum skb_drop_reason reason;
-};
-
-static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
-{
-	return (struct dev_kfree_skb_cb *)skb->cb;
-}
-
 void netif_schedule_queue(struct netdev_queue *txq)
 {
 	rcu_read_lock();
@@ -3182,7 +3173,11 @@ void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason)
 	} else if (likely(!refcount_dec_and_test(&skb->users))) {
 		return;
 	}
-	get_kfree_skb_cb(skb)->reason = reason;
+
+	/* There is no need to save the old cb since we are the only user. */
+	KFREE_SKB_CB(skb)->reason = reason;
+	KFREE_SKB_CB(skb)->rx_sk = NULL;
+
 	local_irq_save(flags);
 	skb->next = __this_cpu_read(softnet_data.completion_queue);
 	__this_cpu_write(softnet_data.completion_queue, skb);
@@ -5229,17 +5224,17 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
 			clist = clist->next;
 
 			WARN_ON(refcount_read(&skb->users));
-			if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED))
+			if (likely(KFREE_SKB_CB(skb)->reason == SKB_CONSUMED))
 				trace_consume_skb(skb, net_tx_action);
 			else
 				trace_kfree_skb(skb, net_tx_action,
-						get_kfree_skb_cb(skb)->reason);
+						KFREE_SKB_CB(skb)->reason);
 
 			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
 				__kfree_skb(skb);
 			else
 				__napi_kfree_skb(skb,
-						 get_kfree_skb_cb(skb)->reason);
+						 KFREE_SKB_CB(skb)->reason);
 		}
 	}
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 466999a7515e..5ce6996512a1 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1190,7 +1190,8 @@ void __kfree_skb(struct sk_buff *skb)
 EXPORT_SYMBOL(__kfree_skb);
 
 static __always_inline
-bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
+bool __kfree_skb_reason(struct sk_buff *skb, struct sock *rx_sk,
+			enum skb_drop_reason reason)
 {
 	if (unlikely(!skb_unref(skb)))
 		return false;
@@ -1201,28 +1202,30 @@ bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
 				SKB_DROP_REASON_SUBSYS_NUM);
 
 	if (reason == SKB_CONSUMED)
-		trace_consume_skb(skb, __builtin_return_address(0));
+		call_trace_consume_skb(skb, rx_sk, __builtin_return_address(0));
 	else
-		trace_kfree_skb(skb, __builtin_return_address(0), reason);
+		call_trace_kfree_skb(skb, rx_sk, __builtin_return_address(0), reason);
+
 	return true;
 }
 
 /**
- *	kfree_skb_reason - free an sk_buff with special reason
+ *	kfree_skb_for_sk - free an sk_buff with special reason and receiving socket
  *	@skb: buffer to free
+ *	@rx_sk: the socket that would receive the buffer, or NULL if not applicable
  *	@reason: reason why this skb is dropped
  *
  *	Drop a reference to the buffer and free it if the usage count has
- *	hit zero. Meanwhile, pass the drop reason to 'kfree_skb'
+ *	hit zero. Meanwhile, pass the drop reason and rx socket to 'kfree_skb'
  *	tracepoint.
  */
-void __fix_address
-kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
+void __fix_address kfree_skb_for_sk(struct sk_buff *skb, struct sock *rx_sk,
+				    enum skb_drop_reason reason)
 {
-	if (__kfree_skb_reason(skb, reason))
+	if (__kfree_skb_reason(skb, rx_sk, reason))
 		__kfree_skb(skb);
 }
-EXPORT_SYMBOL(kfree_skb_reason);
+EXPORT_SYMBOL(kfree_skb_for_sk);
 
 #define KFREE_SKB_BULK_SIZE	16
 
@@ -1261,7 +1264,7 @@ kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason)
 	while (segs) {
 		struct sk_buff *next = segs->next;
 
-		if (__kfree_skb_reason(segs, reason)) {
+		if (__kfree_skb_reason(segs, NULL, reason)) {
 			skb_poison_list(segs);
 			kfree_skb_add_bulk(segs, &sa, reason);
 		}
@@ -1405,7 +1408,7 @@ void consume_skb(struct sk_buff *skb)
 	if (!skb_unref(skb))
 		return;
 
-	trace_consume_skb(skb, __builtin_return_address(0));
+	call_trace_consume_skb(skb, NULL, __builtin_return_address(0));
 	__kfree_skb(skb);
 }
 EXPORT_SYMBOL(consume_skb);
@@ -1420,7 +1423,7 @@ EXPORT_SYMBOL(consume_skb);
  */
 void __consume_stateless_skb(struct sk_buff *skb)
 {
-	trace_consume_skb(skb, __builtin_return_address(0));
+	call_trace_consume_skb(skb, NULL, __builtin_return_address(0));
 	skb_release_data(skb, SKB_CONSUMED);
 	kfree_skbmem(skb);
 }
@@ -1478,7 +1481,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
 		return;
 
 	/* if reaching here SKB is ready to free */
-	trace_consume_skb(skb, __builtin_return_address(0));
+	call_trace_consume_skb(skb, NULL, __builtin_return_address(0));
 
 	/* if SKB is a clone, don't handle this case */
 	if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
-- 
2.30.2


