[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20260107-skb-meta-safeproof-netdevs-rx-only-v3-16-0d461c5e4764@cloudflare.com>
Date: Wed, 07 Jan 2026 15:28:16 +0100
From: Jakub Sitnicki <jakub@...udflare.com>
To: bpf@...r.kernel.org
Cc: netdev@...r.kernel.org, "David S. Miller" <davem@...emloft.net>,
Eric Dumazet <edumazet@...gle.com>, Jakub Kicinski <kuba@...nel.org>,
Paolo Abeni <pabeni@...hat.com>, Alexei Starovoitov <ast@...nel.org>,
Daniel Borkmann <daniel@...earbox.net>,
Jesper Dangaard Brouer <hawk@...nel.org>,
John Fastabend <john.fastabend@...il.com>,
Stanislav Fomichev <sdf@...ichev.me>, Simon Horman <horms@...nel.org>,
Andrii Nakryiko <andrii@...nel.org>,
Martin KaFai Lau <martin.lau@...ux.dev>,
Eduard Zingerman <eddyz87@...il.com>, Song Liu <song@...nel.org>,
Yonghong Song <yonghong.song@...ux.dev>, KP Singh <kpsingh@...nel.org>,
Hao Luo <haoluo@...gle.com>, Jiri Olsa <jolsa@...nel.org>,
kernel-team@...udflare.com
Subject: [PATCH bpf-next v3 16/17] bpf: Realign skb metadata for TC progs using data_meta
After decoupling metadata location from MAC header offset, a gap can appear
between metadata and skb->data on L2 decapsulation (e.g., VLAN, GRE). This
breaks the BPF data_meta pointer which assumes metadata is directly before
skb->data.
Introduce bpf_skb_meta_realign() kfunc to close the gap by moving metadata
to immediately precede the MAC header. Inject a call to it in
tc_cls_act_prologue() when the verifier detects data_meta access
(PA_F_DATA_META_LOAD flag).
Update skb_data_move() to handle the gap case: on skb_push(), move metadata
to the start of the head buffer (skb->head); on skb_pull() where metadata
is already detached from skb->data, leave it in place.
This restores data_meta functionality for TC programs while keeping the
performance benefit of avoiding memmove on L2 decapsulation for programs
that don't use data_meta.
Signed-off-by: Jakub Sitnicki <jakub@...udflare.com>
---
include/linux/skbuff.h | 25 +++++++++++++++++--------
net/core/filter.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 61 insertions(+), 10 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6dd09f55a975..0fc4df42826e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4600,19 +4600,28 @@ static inline void skb_data_move(struct sk_buff *skb, const int len,
if (!meta_len)
goto no_metadata;
- meta_end = skb_metadata_end(skb);
- meta = meta_end - meta_len;
-
- if (WARN_ON_ONCE(meta_end + len != skb->data ||
- meta_len > skb_headroom(skb))) {
+ /* Not enough headroom left for metadata. Drop it. */
+ if (WARN_ONCE(meta_len > skb_headroom(skb),
+ "skb headroom smaller than metadata")) {
skb_metadata_clear(skb);
goto no_metadata;
}
- memmove(meta + len, meta, meta_len + n);
- skb_shinfo(skb)->meta_end += len;
- return;
+ meta_end = skb_metadata_end(skb);
+ meta = meta_end - meta_len;
+ /* Metadata in front of data before push/pull. Keep it that way. */
+ if (meta_end == skb->data - len) {
+ memmove(meta + len, meta, meta_len + n);
+ skb_shinfo(skb)->meta_end += len;
+ return;
+ }
+
+ if (len < 0) {
+ /* Data pushed. Move metadata to the top. */
+ memmove(skb->head, meta, meta_len);
+ skb_shinfo(skb)->meta_end = meta_len;
+ }
no_metadata:
memmove(skb->data, skb->data - len, n);
}
diff --git a/net/core/filter.c b/net/core/filter.c
index e91d5a39e0a7..df4c97fe79ee 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -9136,11 +9136,53 @@ static int bpf_gen_ld_abs(const struct bpf_insn *orig,
return insn - insn_buf;
}
+static void bpf_skb_meta_realign(struct sk_buff *skb)
+{
+ u8 *meta_end = skb_metadata_end(skb);
+ u8 meta_len = skb_metadata_len(skb);
+ u8 *meta;
+ int gap;
+
+ gap = skb_mac_header(skb) - meta_end;
+ if (!meta_len || !gap)
+ return;
+
+ if (WARN_ONCE(gap < 0, "skb metadata end past mac header")) {
+ skb_metadata_clear(skb);
+ return;
+ }
+
+ meta = meta_end - meta_len;
+ memmove(meta + gap, meta, meta_len);
+ skb_shinfo(skb)->meta_end += gap;
+
+ bpf_compute_data_pointers(skb);
+}
+
static int tc_cls_act_prologue(struct bpf_insn *insn_buf, u32 pkt_access_flags,
const struct bpf_prog *prog)
{
- return bpf_unclone_prologue(insn_buf, pkt_access_flags, prog,
- TC_ACT_SHOT);
+ struct bpf_insn *insn = insn_buf;
+ int cnt;
+
+ if (pkt_access_flags & PA_F_DATA_META_LOAD) {
+ /* Realign skb metadata for access through data_meta pointer.
+ *
+ * r6 = r1; // r6 will be "u64 *ctx"
+ * r0 = bpf_skb_meta_realign(r1); // r0 is undefined
+ * r1 = r6;
+ */
+ BUILD_BUG_ON(!__same_type(&bpf_skb_meta_realign,
+ (void (*)(struct sk_buff *))NULL));
+ *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+ *insn++ = BPF_EMIT_CALL(bpf_skb_meta_realign);
+ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
+ }
+ cnt = bpf_unclone_prologue(insn, pkt_access_flags, prog, TC_ACT_SHOT);
+ if (!cnt && insn > insn_buf)
+ *insn++ = prog->insnsi[0];
+
+ return cnt + insn - insn_buf;
}
static bool tc_cls_act_is_valid_access(int off, int size,
--
2.43.0
Powered by blists - more mailing lists