[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20220121073045.4179438-1-kafai@fb.com>
Date: Thu, 20 Jan 2022 23:30:45 -0800
From: Martin KaFai Lau <kafai@...com>
To: <bpf@...r.kernel.org>, <netdev@...r.kernel.org>
CC: Alexei Starovoitov <ast@...nel.org>,
Andrii Nakryiko <andrii@...nel.org>,
Daniel Borkmann <daniel@...earbox.net>,
David Miller <davem@...emloft.net>,
Eric Dumazet <edumazet@...gle.com>,
Jakub Kicinski <kuba@...nel.org>, <kernel-team@...com>,
Willem de Bruijn <willemb@...gle.com>
Subject: [RFC PATCH v3 net-next 3/4] net: Set skb->mono_delivery_time and clear it when delivering locally
This patch sets the skb->mono_delivery_time to flag the skb->tstamp
is used as the mono delivery_time (EDT) instead of the (rcv) timestamp.
skb_clear_delivery_time() is added to clear the delivery_time and set
back to the (rcv) timestamp if needed when the skb is being delivered
locally (to a sk). skb_clear_delivery_time() is called in
ip_local_deliver() and ip6_input(). In most of the regular ingress
cases, the skb->tstamp should already have the (rcv) timestamp.
For the egress loop back to ingress cases, the marking of the (rcv)
timestamp is postponed from dev.c to ip_local_deliver() and
ip6_input().
Another case needs to clear the delivery_time is the network
tapping (e.g. af_packet by tcpdump). Regardless of tapping at the ingress
or egress, the tapped skb is received by the af_packet socket, so
it is ingress to the af_packet socket and it expects
the (rcv) timestamp.
When tapping at egress, dev_queue_xmit_nit() is used. It has already
expected skb->tstamp may have delivery_time, so it does
skb_clone()+net_timestamp_set() to ensure the cloned skb has
the (rcv) timestamp before passing to the af_packet sk.
This patch only adds to clear the skb->mono_delivery_time
bit in net_timestamp_set().
When tapping at ingress, it currently expects the skb->tstamp is either 0
or has the (rcv) timestamp. Meaning, the tapping at ingress path
has already expected the skb->tstamp could be 0 and it will get
the (rcv) timestamp by ktime_get_real() when needed.
There are two cases for tapping at ingress:
One case is af_packet queues the skb to its sk_receive_queue. The skb
is either not shared or new clone created. The skb_clear_delivery_time()
is called to clear the delivery_time (if any) before it is queued to the
sk_receive_queue.
Another case, the ingress skb is directly copied to the rx_ring
and tpacket_get_timestamp() is used to get the (rcv) timestamp.
skb_tstamp() is used in tpacket_get_timestamp() to check
the skb->mono_delivery_time bit before returning skb->tstamp.
As mentioned earlier, the tapping@...ress has already expected
the skb may not have the (rcv) timestamp (because no sk has asked
for it) and has handled this case by directly calling ktime_get_real().
In __skb_tstamp_tx, it clones the egress skb and queues the clone to the
sk_error_queue. The outgoing skb may have the mono delivery_time while
the (rcv) timestamp is expected for the clone, so the
skb->mono_delivery_time bit is also cleared from the clone.
Signed-off-by: Martin KaFai Lau <kafai@...com>
---
include/linux/skbuff.h | 27 +++++++++++++++++++++++++--
net/core/dev.c | 4 +++-
net/core/skbuff.c | 6 ++++--
net/ipv4/ip_input.c | 1 +
net/ipv6/ip6_input.c | 1 +
net/packet/af_packet.c | 4 +++-
6 files changed, 37 insertions(+), 6 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 8de555513b94..4677bb6c7279 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3912,8 +3912,23 @@ static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt,
bool mono)
{
skb->tstamp = kt;
- /* Setting mono_delivery_time will be enabled later */
- /* skb->mono_delivery_time = kt && mono; */
+ skb->mono_delivery_time = kt && mono;
+}
+
+DECLARE_STATIC_KEY_FALSE(netstamp_needed_key);
+
+/* skb is delivering locally. If needed, set it to the (rcv) timestamp.
+ * Otherwise, clear the delivery time.
+ */
+static inline void skb_clear_delivery_time(struct sk_buff *skb)
+{
+ if (unlikely(skb->mono_delivery_time)) {
+ skb->mono_delivery_time = 0;
+ if (static_branch_unlikely(&netstamp_needed_key))
+ skb->tstamp = ktime_get_real();
+ else
+ skb->tstamp = 0;
+ }
}
static inline void skb_clear_tstamp(struct sk_buff *skb)
@@ -3924,6 +3939,14 @@ static inline void skb_clear_tstamp(struct sk_buff *skb)
skb->tstamp = 0;
}
+static inline ktime_t skb_tstamp(const struct sk_buff *skb)
+{
+ if (unlikely(skb->mono_delivery_time))
+ return 0;
+
+ return skb->tstamp;
+}
+
static inline u8 skb_metadata_len(const struct sk_buff *skb)
{
return skb_shinfo(skb)->meta_len;
diff --git a/net/core/dev.c b/net/core/dev.c
index 84a0d9542fe9..b4b392f5ef9f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2000,7 +2000,8 @@ void net_dec_egress_queue(void)
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
-static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
+DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
+EXPORT_SYMBOL(netstamp_needed_key);
#ifdef CONFIG_JUMP_LABEL
static atomic_t netstamp_needed_deferred;
static atomic_t netstamp_wanted;
@@ -2061,6 +2062,7 @@ EXPORT_SYMBOL(net_disable_timestamp);
static inline void net_timestamp_set(struct sk_buff *skb)
{
skb->tstamp = 0;
+ skb->mono_delivery_time = 0;
if (static_branch_unlikely(&netstamp_needed_key))
__net_timestamp(skb);
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3e3da8fdf8f5..93dc763da8cb 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4817,10 +4817,12 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey;
}
- if (hwtstamps)
+ if (hwtstamps) {
*skb_hwtstamps(skb) = *hwtstamps;
- else
+ } else {
skb->tstamp = ktime_get_real();
+ skb->mono_delivery_time = 0;
+ }
__skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
}
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 3a025c011971..35311ca75496 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -244,6 +244,7 @@ int ip_local_deliver(struct sk_buff *skb)
*/
struct net *net = dev_net(skb->dev);
+ skb_clear_delivery_time(skb);
if (ip_is_fragment(ip_hdr(skb))) {
if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
return 0;
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 80256717868e..84f93864b774 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -469,6 +469,7 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk
int ip6_input(struct sk_buff *skb)
{
+ skb_clear_delivery_time(skb);
return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN,
dev_net(skb->dev), NULL, skb, skb->dev, NULL,
ip6_input_finish);
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 5bd409ab4cc2..ab55adff3500 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -460,7 +460,7 @@ static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts,
return TP_STATUS_TS_RAW_HARDWARE;
if ((flags & SOF_TIMESTAMPING_SOFTWARE) &&
- ktime_to_timespec64_cond(skb->tstamp, ts))
+ ktime_to_timespec64_cond(skb_tstamp(skb), ts))
return TP_STATUS_TS_SOFTWARE;
return 0;
@@ -2195,6 +2195,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
spin_lock(&sk->sk_receive_queue.lock);
po->stats.stats1.tp_packets++;
sock_skb_set_dropcount(sk, skb);
+ skb_clear_delivery_time(skb);
__skb_queue_tail(&sk->sk_receive_queue, skb);
spin_unlock(&sk->sk_receive_queue.lock);
sk->sk_data_ready(sk);
@@ -2373,6 +2374,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
po->stats.stats1.tp_packets++;
if (copy_skb) {
status |= TP_STATUS_COPY;
+ skb_clear_delivery_time(copy_skb);
__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
}
spin_unlock(&sk->sk_receive_queue.lock);
--
2.30.2
Powered by blists - more mailing lists