[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <2b2c3eb1-df87-40fe-b871-b52812c8ecd0@linux.dev>
Date: Thu, 25 Apr 2024 21:37:36 -0700
From: Martin KaFai Lau <martin.lau@...ux.dev>
To: Abhishek Chauhan <quic_abchauha@...cinc.com>
Cc: "David S. Miller" <davem@...emloft.net>,
Eric Dumazet <edumazet@...gle.com>, Jakub Kicinski <kuba@...nel.org>,
Paolo Abeni <pabeni@...hat.com>, netdev@...r.kernel.org,
linux-kernel@...r.kernel.org, Andrew Halaney <ahalaney@...hat.com>,
Willem de Bruijn <willemdebruijn.kernel@...il.com>,
Martin KaFai Lau <martin.lau@...nel.org>,
Daniel Borkmann <daniel@...earbox.net>, bpf <bpf@...r.kernel.org>,
kernel@...cinc.com
Subject: Re: [RFC PATCH bpf-next v5 2/2] net: Add additional bit to support
clockid_t timestamp type
On 4/24/24 3:20 PM, Abhishek Chauhan wrote:
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index e464d0ebc9c1..3ad0de07d261 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -711,6 +711,8 @@ typedef unsigned char *sk_buff_data_t;
> enum skb_tstamp_type {
> SKB_CLOCK_REALTIME,
> SKB_CLOCK_MONOTONIC,
> + SKB_CLOCK_TAI,
> + __SKB_CLOCK_MAX = SKB_CLOCK_TAI,
> };
>
> /**
> @@ -831,8 +833,8 @@ enum skb_tstamp_type {
> * @decrypted: Decrypted SKB
> * @slow_gro: state present at GRO time, slower prepare step required
> * @tstamp_type: When set, skb->tstamp has the
> - * delivery_time in mono clock base Otherwise, the
> - * timestamp is considered real clock base.
> + * delivery_time in mono clock base or clock base of skb->tstamp.
> + * Otherwise, the timestamp is considered real clock base
> * @napi_id: id of the NAPI struct this skb came from
> * @sender_cpu: (aka @napi_id) source CPU in XPS
> * @alloc_cpu: CPU which did the skb allocation.
> @@ -960,7 +962,7 @@ struct sk_buff {
> /* private: */
> __u8 __mono_tc_offset[0];
> /* public: */
> - __u8 tstamp_type:1; /* See skb_tstamp_type */
> + __u8 tstamp_type:2; /* See skb_tstamp_type */
> #ifdef CONFIG_NET_XGRESS
> __u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */
> __u8 tc_skip_classify:1;
> @@ -1090,15 +1092,17 @@ struct sk_buff {
> #endif
> #define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset)
>
> -/* if you move tc_at_ingress or mono_delivery_time
> +/* if you move tc_at_ingress or tstamp_type:2
> * around, you also must adapt these constants.
> */
> #ifdef __BIG_ENDIAN_BITFIELD
> -#define SKB_MONO_DELIVERY_TIME_MASK (1 << 7)
> -#define TC_AT_INGRESS_MASK (1 << 6)
> +#define SKB_TSTAMP_TYPE_MASK (3 << 6)
> +#define SKB_TSTAMP_TYPE_RSH (6)
> +#define TC_AT_INGRESS_RSH (5)
TC_AT_INGRESS_RSH is not used.
> +#define TC_AT_INGRESS_MASK (1 << 5)
> #else
> -#define SKB_MONO_DELIVERY_TIME_MASK (1 << 0)
> -#define TC_AT_INGRESS_MASK (1 << 1)
> +#define SKB_TSTAMP_TYPE_MASK (3)
> +#define TC_AT_INGRESS_MASK (1 << 2)
> #endif
> #define SKB_BF_MONO_TC_OFFSET offsetof(struct sk_buff, __mono_tc_offset)
>
> @@ -4204,6 +4208,12 @@ static inline void skb_set_tstamp_type_frm_clkid(struct sk_buff *skb,
> case CLOCK_MONOTONIC:
> skb->tstamp_type = SKB_CLOCK_MONOTONIC;
> break;
> + case CLOCK_TAI:
> + skb->tstamp_type = SKB_CLOCK_TAI;
> + break;
> + default:
> + WARN_ONCE(true, "clockid %d not supported", clockid);
> + skb->tstamp_type = SKB_CLOCK_REALTIME;
> }
> }
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index cee0a7915c08..1376ed5ece10 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
The bpf.h needs to be sync to tools/include/uapi/linux/bpf.h.
Otherwise, the bpf CI cannot compile the tests:
https://patchwork.kernel.org/project/netdevbpf/patch/20240424222028.1080134-2-quic_abchauha@quicinc.com/
Please monitor the bpf CI test result after submitting the patches.
> @@ -6209,6 +6209,7 @@ union { \
> enum {
> BPF_SKB_TSTAMP_UNSPEC,
> BPF_SKB_TSTAMP_DELIVERY_MONO, /* tstamp has mono delivery time */
> + BPF_SKB_TSTAMP_DELIVERY_TAI, /* tstamp has tai delivery time */
> /* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle,
> * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC
> * and try to deduce it by ingress, egress or skb->sk->sk_clockid.
SKB_CLOCK_TAI is properly defined as an enum now and there is a
WARN for clock other than REAL, MONO, and TAI. I think it is
time to remove UNSPEC and give it back the proper name REALTIME.
I want to take this chance to do some renaming:
/* The enum used in skb->tstamp_type. It specifies the clock type
* of the time stored in the skb->tstamp.
*/
enum {
BPF_SKB_TSTAMP_UNSPEC = 0, /* DEPRECATED */
BPF_SKB_TSTAMP_DELIVERY_MONO = 1, /* DEPRECATED */
BPF_SKB_CLOCK_REALTIME = 0, /* Realtime clock */
BPF_SKB_CLOCK_MONOTONIC = 1, /* Monotonic clock */
BPF_SKB_CLOCK_TAI = 2, /* TAI clock */
/* For any future BPF_SKB_CLOCK_* that the bpf prog cannot handle,
* the bpf prog can try to deduce it by ingress/egress/skb->sk->sk_clockid.
*/
};
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 957c2fc724eb..c67622f4fe98 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -7733,6 +7733,12 @@ BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb,
> skb->tstamp = tstamp;
> skb->tstamp_type = SKB_CLOCK_MONOTONIC;
> break;
> + case BPF_SKB_TSTAMP_DELIVERY_TAI:
> + if (!tstamp)
> + return -EINVAL;
> + skb->tstamp = tstamp;
> + skb->tstamp_type = SKB_CLOCK_TAI;
> + break;
> case BPF_SKB_TSTAMP_UNSPEC:
> if (tstamp)
Allow to store any realtime tstamp here since BPF_SKB_TSTAMP_UNSPEC
becomes BPF_SKB_CLOCK_REALTIME.
Like:
BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb,
u64, tstamp, u32, tstamp_type)
{
/* ... */
case BPF_SKB_CLOCK_TAI:
if (!tstamp)
return -EINVAL;
skb->tstamp = tstamp;
skb->tstamp_type = SKB_CLOCK_TAI;
break;
case BPF_SKB_CLOCK_REALTIME:
skb->tstamp = tstamp;
skb->tstamp_type = SKB_CLOCK_REALTIME;
break;
/* ... */
}
> return -EINVAL;
> @@ -9388,17 +9394,17 @@ static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si,
> {
> __u8 value_reg = si->dst_reg;
> __u8 skb_reg = si->src_reg;
> - /* AX is needed because src_reg and dst_reg could be the same */
> - __u8 tmp_reg = BPF_REG_AX;
> -
> - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
> - SKB_BF_MONO_TC_OFFSET);
> - *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg,
> - SKB_MONO_DELIVERY_TIME_MASK, 2);
> - *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC);
> - *insn++ = BPF_JMP_A(1);
> - *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO);
> -
> + BUILD_BUG_ON(__SKB_CLOCK_MAX != BPF_SKB_TSTAMP_DELIVERY_TAI);
Add these also:
BUILD_BUG_ON(SKB_CLOCK_REALTIME != BPF_SKB_CLOCK_REALTIME);
BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != BPF_SKB_CLOCK_MONOTONIC);
BUILD_BUG_ON(SKB_CLOCK_TAI != BPF_SKB_CLOCK_TAI);
> + *insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
> + *insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK);
> +#ifdef __BIG_ENDIAN_BITFIELD
> + *insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSH);
> +#else
> + BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1));
> +#endif
> + *insn++ = BPF_JMP32_IMM(BPF_JNE, value_reg, SKB_TSTAMP_TYPE_MASK, 1);
> + /* Both the bits set then mark it BPF_SKB_TSTAMP_UNSPEC */
> + *insn++ = BPF_MOV64_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC);
The kernel should not have both bits set in skb->tstamp_type. No need to
add two extra bpf insns to check this. If there is a bug in the kernel,
it is better to be uncovered instead of hiding it under BPF_SKB_TSTAMP_UNSPEC (which
is renamed to BPF_SKB_CLOCK_REALTIME anyway).
Hence, the last two bpf insns should be removed.
> return insn;
> }
>
> @@ -9430,6 +9436,7 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
> __u8 value_reg = si->dst_reg;
> __u8 skb_reg = si->src_reg;
>
> +BUILD_BUG_ON(__SKB_CLOCK_MAX != BPF_SKB_TSTAMP_DELIVERY_TAI);
It is a dup of the one in bpf_convert_tstamp_type_read and can be removed.
> #ifdef CONFIG_NET_XGRESS
> /* If the tstamp_type is read,
> * the bpf prog is aware the tstamp could have delivery time.
> @@ -9440,11 +9447,12 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
> __u8 tmp_reg = BPF_REG_AX;
>
> *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
> - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
> - TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK);
> - *insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg,
> - TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 2);
> - /* skb->tc_at_ingress && skb->tstamp_type:1,
> + /* check if ingress mask bits is set */
> + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
> + *insn++ = BPF_JMP_A(4);
> + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1);
> + *insn++ = BPF_JMP_A(2);
> + /* skb->tc_at_ingress && skb->tstamp_type:2,
> * read 0 as the (rcv) timestamp.
> */
> *insn++ = BPF_MOV64_IMM(value_reg, 0);
> @@ -9469,7 +9477,7 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
> * the bpf prog is aware the tstamp could have delivery time.
> * Thus, write skb->tstamp as is if tstamp_type_access is true.
> * Otherwise, writing at ingress will have to clear the
> - * mono_delivery_time (skb->tstamp_type:1)bit also.
> + * mono_delivery_time (skb->tstamp_type:2)bit also.
> */
> if (!prog->tstamp_type_access) {
> __u8 tmp_reg = BPF_REG_AX;
> @@ -9479,8 +9487,8 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
> *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
> /* goto <store> */
> *insn++ = BPF_JMP_A(2);
> - /* <clear>: mono_delivery_time or (skb->tstamp_type:1) */
> - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK);
> + /* <clear>: skb->tstamp_type:2 */
> + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK);
> *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET);
> }
> #endif
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index 591226dcde26..f195b31d6e75 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -1457,7 +1457,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
>
> skb->priority = (cork->tos != -1) ? cork->priority: READ_ONCE(sk->sk_priority);
> skb->mark = cork->mark;
> - skb->tstamp = cork->transmit_time;
> + skb_set_tstamp_type_frm_clkid(skb, cork->transmit_time, sk->sk_clockid);
hmm... I think this will break for tcp. This sequence in particular:
tcp_v4_timewait_ack()
tcp_v4_send_ack()
ip_send_unicast_reply()
ip_push_pending_frames()
ip_finish_skb()
__ip_make_skb()
/* sk_clockid is REAL but cork->transmit_time should be in mono */
skb_set_tstamp_type_frm_clkid(skb, cork->transmit_time, sk->sk_clockid);;
I think I hit it from time to time when running the test in this patch set.
[ ... ]
> diff --git a/tools/testing/selftests/bpf/progs/test_tc_dtime.c b/tools/testing/selftests/bpf/progs/test_tc_dtime.c
> index 74ec09f040b7..19dba6d88265 100644
> --- a/tools/testing/selftests/bpf/progs/test_tc_dtime.c
> +++ b/tools/testing/selftests/bpf/progs/test_tc_dtime.c
Please separate the selftests/bpf changes into another patch.
> @@ -227,6 +227,12 @@ int egress_host(struct __sk_buff *skb)
> inc_dtimes(EGRESS_ENDHOST);
> else
> inc_errs(EGRESS_ENDHOST);
> + } else if (skb_proto(skb_type) == IPPROTO_UDP) {
> + if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_TAI &&
> + skb->tstamp)
> + inc_dtimes(EGRESS_ENDHOST);
> + else
> + inc_errs(EGRESS_ENDHOST);
> } else {
> if (skb->tstamp_type == BPF_SKB_TSTAMP_UNSPEC &&
> skb->tstamp)
> @@ -255,6 +261,9 @@ int ingress_host(struct __sk_buff *skb)
> if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO &&
> skb->tstamp == EGRESS_FWDNS_MAGIC)
> inc_dtimes(INGRESS_ENDHOST);
> + else if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_TAI &&
> + skb->tstamp == EGRESS_FWDNS_MAGIC)
> + inc_dtimes(INGRESS_ENDHOST);
> else
> inc_errs(INGRESS_ENDHOST);
>
> @@ -323,12 +332,14 @@ int ingress_fwdns_prio101(struct __sk_buff *skb)
> /* Should have handled in prio100 */
> return TC_ACT_SHOT;
>
> - if (skb_proto(skb_type) == IPPROTO_UDP)
> + if (skb_proto(skb_type) == IPPROTO_UDP &&
> + skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_TAI)
> expected_dtime = 0;
The IPPROTO_UDP check and expected_dtime can be removed. The UDP test
can expect the same EGRESS_ENDHOST_MAGIC in the skb->tstamp since
the TAI tstamp is also forwarded from egress to ingress now.
>
> if (skb->tstamp_type) {
> if (fwdns_clear_dtime() ||
> - skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO ||
> + (skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO &&
> + skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_TAI) ||
> skb->tstamp != expected_dtime)
> inc_errs(INGRESS_FWDNS_P101);
> else
> @@ -338,7 +349,8 @@ int ingress_fwdns_prio101(struct __sk_buff *skb)
> inc_errs(INGRESS_FWDNS_P101);
> }
>
> - if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO) {
> + if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO ||
> + skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_TAI) {
No need to check BPF_SKB_TSTAMP_DELIVERY_TAI such that the
bpf_skb_set_tstamp() helper can still be tested.
There are some other minor changes needed for the test_tc_dtime.c and
the tc_redirect.c. I quickly made the changes and put them here (first patch):
https://git.kernel.org/pub/scm/linux/kernel/git/martin.lau/bpf-next.git/log/?h=skb.tstamp_type
> skb->tstamp = INGRESS_FWDNS_MAGIC;
> } else {
> if (bpf_skb_set_tstamp(skb, INGRESS_FWDNS_MAGIC,
> @@ -370,7 +382,8 @@ int egress_fwdns_prio101(struct __sk_buff *skb)
>
> if (skb->tstamp_type) {
> if (fwdns_clear_dtime() ||
> - skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO ||
> + (skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO &&
> + skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_TAI) ||
> skb->tstamp != INGRESS_FWDNS_MAGIC)
> inc_errs(EGRESS_FWDNS_P101);
> else
> @@ -380,7 +393,8 @@ int egress_fwdns_prio101(struct __sk_buff *skb)
> inc_errs(EGRESS_FWDNS_P101);
> }
>
> - if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO) {
> + if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO ||
> + skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_TAI) {
> skb->tstamp = EGRESS_FWDNS_MAGIC;
> } else {
> if (bpf_skb_set_tstamp(skb, EGRESS_FWDNS_MAGIC,
Powered by blists - more mailing lists