Message-Id: <20230215034444.482178-1-kuba@kernel.org>
Date: Tue, 14 Feb 2023 19:44:44 -0800
From: Jakub Kicinski <kuba@...nel.org>
To: davem@...emloft.net
Cc: netdev@...r.kernel.org, edumazet@...gle.com, pabeni@...hat.com,
willemb@...gle.com, fw@...len.de, Jakub Kicinski <kuba@...nel.org>
Subject: [RFC] net: skbuff: let struct skb_ext live inside the head
This is a bit more crazy than the previous patch. For drivers
which already use build_skb() it's relatively easy to reserve
extra space after the shinfo. Use that space to place the skb_ext
inside the head, so no separate allocation is needed.
This approach is a bit slower in trivial benchmarks than the recycling
approach because it requires extra cache line accesses (12.1% loss,
down to 18.6Gbps).
The in-place skb_ext may be shorter than a full skb_ext object;
the driver only reserves space for the extensions it may use.
Any later addition will reallocate the space via CoW, abandoning
the in-place skb_ext and copying the data to a full slab object.
Signed-off-by: Jakub Kicinski <kuba@...nel.org>
---
include/linux/skbuff.h | 29 +++++++-
net/core/gro.c | 1 +
net/core/skbuff.c | 156 +++++++++++++++++++++++++++++++++++++++--
3 files changed, 178 insertions(+), 8 deletions(-)
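As a rough illustration of the intended driver-side flow (not part of
the patch): DRV_RX_HEADROOM, DRV_EXT_SPACE and drv_build_rx_skb() are
made-up names, and the tc extension (CONFIG_NET_TC_SKB_EXT) is just an
example of an ext type a driver might reserve space for.

/* Sketch of a driver RX path using the new helpers.  DRV_EXT_SPACE
 * must cover struct skb_ext plus the 8-byte-aligned extension data
 * the driver may attach, and the RX buffer must have been allocated
 * with that much extra room after the shared info.
 */
static struct sk_buff *drv_build_rx_skb(void *data, unsigned int len,
					u32 tc_chain)
{
	unsigned int frag_size;
	struct tc_skb_ext *ext;
	struct sk_buff *skb;

	frag_size = SKB_DATA_ALIGN(DRV_RX_HEADROOM + len) +
		    SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) +
		    DRV_EXT_SPACE;

	skb = napi_build_skb_ext(data, frag_size, DRV_EXT_SPACE);
	if (unlikely(!skb))
		return NULL;

	skb_reserve(skb, DRV_RX_HEADROOM);
	skb_put(skb, len);

	/* Lands in the reserved area right after shinfo, no kmem_cache
	 * allocation here.  Additions made later via the normal
	 * skb_ext_add() path CoW the data into a full slab object.
	 */
	ext = skb_ext_add_inplace(skb, TC_SKB_EXT);
	if (ext)
		ext->chain = tc_chain;

	return skb;
}
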
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index e68cb0a777b9..86486b667c94 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1257,6 +1257,8 @@ struct sk_buff *build_skb_around(struct sk_buff *skb,
void *data, unsigned int frag_size);
void skb_attempt_defer_free(struct sk_buff *skb);
+struct sk_buff *napi_build_skb_ext(void *data, unsigned int frag_size,
+ unsigned int ext_size);
struct sk_buff *napi_build_skb(void *data, unsigned int frag_size);
struct sk_buff *slab_build_skb(void *data);
@@ -4601,6 +4603,16 @@ enum skb_ext_id {
SKB_EXT_NUM, /* must be last */
};
+enum skb_ext_ref {
+ /* Normal skb ext from the kmem_cache */
+ SKB_EXT_ALLOC_SLAB,
+ /* A shard is a read-only skb ext placed after shinfo in the head;
+ * shards may be shorter than the full skb_ext length!
+ */
+ SKB_EXT_ALLOC_SHARD_NOREF,
+ SKB_EXT_ALLOC_SHARD_REF,
+};
+
/**
* struct skb_ext - sk_buff extensions
* @refcnt: 1 on allocation, deallocated on 0
@@ -4613,6 +4625,7 @@ enum skb_ext_id {
*/
struct skb_ext {
refcount_t refcnt;
+ u8 alloc_type;
u8 offset[SKB_EXT_NUM]; /* in chunks of 8 bytes */
u8 chunks; /* same */
char data[] __aligned(8);
@@ -4623,8 +4636,10 @@ void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
struct skb_ext *ext);
void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id);
void *napi_skb_ext_add(struct sk_buff *skb, enum skb_ext_id id);
+void *skb_ext_add_inplace(struct sk_buff *skb, enum skb_ext_id id);
void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id);
void __skb_ext_put(struct skb_ext *ext);
+void __skb_ext_copy_get(struct skb_ext *ext);
static inline void skb_ext_put(struct sk_buff *skb)
{
@@ -4640,7 +4655,7 @@ static inline void __skb_ext_copy(struct sk_buff *dst,
if (src->active_extensions) {
struct skb_ext *ext = src->extensions;
- refcount_inc(&ext->refcnt);
+ __skb_ext_copy_get(ext);
dst->extensions = ext;
}
}
@@ -4699,6 +4714,18 @@ static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {}
static inline bool skb_has_extensions(struct sk_buff *skb) { return false; }
#endif /* CONFIG_SKB_EXTENSIONS */
+/* Head got merged into another skb, skb itself will be freed later by
+ * kfree_skb_partial() or napi_skb_free_stolen_head().
+ */
+static inline void skb_head_stolen(struct sk_buff *skb)
+{
+#ifdef CONFIG_SKB_EXTENSIONS
+ if (unlikely(skb->active_extensions) &&
+ skb->extensions->alloc_type == SKB_EXT_ALLOC_SHARD_NOREF)
+ skb->active_extensions = 0;
+#endif
+}
+
static inline void nf_reset_ct(struct sk_buff *skb)
{
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
diff --git a/net/core/gro.c b/net/core/gro.c
index a606705a0859..f1e091fa0d4c 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -250,6 +250,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
delta_truesize = skb->truesize - new_truesize;
skb->truesize = new_truesize;
NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
+ skb_head_stolen(skb);
goto done;
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index feb5034b13ad..19e16d7533ac 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -128,6 +128,8 @@ const char * const drop_reasons[] = {
};
EXPORT_SYMBOL(drop_reasons);
+static int skb_ext_evacuate_head(struct sk_buff *skb);
+
/**
* skb_panic - private function for out-of-line support
* @skb: buffer
@@ -502,6 +504,35 @@ struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
}
EXPORT_SYMBOL(napi_build_skb);
+/**
+ * napi_build_skb_ext - build a network buffer with in-place skb_ext
+ * @data: data buffer provided by caller
+ * @frag_size: size of data
+ * @ext_size: size of space to reserve for skb_ext
+ *
+ * Version of napi_build_skb() that reserves space in the head for
+ * struct skb_ext. If @ext_size is zero or skb_ext_add_inplace() is
+ * never called, there should be no performance loss compared
+ * to napi_build_skb().
+ *
+ * Returns a new &sk_buff on success, %NULL on allocation failure.
+ */
+struct sk_buff *
+napi_build_skb_ext(void *data, unsigned int frag_size, unsigned int ext_size)
+{
+ struct sk_buff *skb;
+
+ skb = __napi_build_skb(data, frag_size - ext_size);
+ if (unlikely(!skb))
+ return NULL;
+
+ skb->head_frag = 1;
+ skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
+
+ return skb;
+}
+EXPORT_SYMBOL(napi_build_skb_ext);
+
/*
* kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
* the caller if emergency pfmemalloc reserves are being used. If it is and
@@ -1257,7 +1288,9 @@ static void napi_skb_ext_put(struct sk_buff *skb)
if (!skb_ext_needs_destruct(ext)) {
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
- if (refcount_read(&ext->refcnt) == 1 && !nc->ext) {
+ if (refcount_read(&ext->refcnt) == 1 &&
+ ext->alloc_type == SKB_EXT_ALLOC_SLAB &&
+ !nc->ext) {
kasan_poison_object_data(skbuff_ext_cache, ext);
nc->ext = ext;
return;
@@ -2028,6 +2061,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
+ if (skb_ext_evacuate_head(skb))
+ return -ENOMEM;
+
data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
goto nodata;
@@ -5666,6 +5702,7 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
skb_fill_page_desc(to, to_shinfo->nr_frags,
page, offset, skb_headlen(from));
+ skb_head_stolen(from);
*fragstolen = true;
} else {
if (to_shinfo->nr_frags +
@@ -6389,6 +6426,9 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
+ if (skb_ext_evacuate_head(skb))
+ return -ENOMEM;
+
data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
return -ENOMEM;
@@ -6505,6 +6545,9 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
+ if (skb_ext_evacuate_head(skb))
+ return -ENOMEM;
+
data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
return -ENOMEM;
@@ -6639,10 +6682,11 @@ static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE);
}
-static void skb_ext_init(struct skb_ext *new)
+static void skb_ext_init_slab(struct skb_ext *new)
{
memset(new->offset, 0, sizeof(new->offset));
refcount_set(&new->refcnt, 1);
+ new->alloc_type = SKB_EXT_ALLOC_SLAB;
}
/**
@@ -6659,7 +6703,7 @@ struct skb_ext *__skb_ext_alloc(gfp_t flags)
struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags);
if (new)
- skb_ext_init(new);
+ skb_ext_init_slab(new);
return new;
}
@@ -6669,7 +6713,8 @@ static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old,
{
struct skb_ext *new;
- if (refcount_read(&old->refcnt) == 1)
+ if (refcount_read(&old->refcnt) == 1 &&
+ old->alloc_type == SKB_EXT_ALLOC_SLAB)
return old;
new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
@@ -6678,6 +6723,7 @@ static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old,
memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE);
refcount_set(&new->refcnt, 1);
+ new->alloc_type = SKB_EXT_ALLOC_SLAB;
#ifdef CONFIG_XFRM
if (old_active & (1 << SKB_EXT_SEC_PATH)) {
@@ -6793,13 +6839,41 @@ void *napi_skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
return NULL;
}
- skb_ext_init(new);
+ skb_ext_init_slab(new);
}
return skb_ext_add_finalize(skb, id, new);
}
EXPORT_SYMBOL(napi_skb_ext_add);
+/**
+ * skb_ext_add_inplace - allocate ext space in napi_build_skb_ext() skb
+ * @skb: buffer
+ * @id: extension to allocate space for
+ *
+ * Creates an extension in a private skb allocated with napi_build_skb_ext().
+ * The caller must have allocated the @skb and must guarantee that there
+ * will be enough space for all the extensions.
+ *
+ * Returns a pointer to the extension or %NULL on failure.
+ */
+void *skb_ext_add_inplace(struct sk_buff *skb, enum skb_ext_id id)
+{
+ struct skb_ext *new = NULL;
+
+ if (!skb->active_extensions) {
+ new = (void *)&skb_shinfo(skb)[1];
+ skb->extensions = new;
+
+ memset(new->offset, 0, sizeof(new->offset));
+ refcount_set(&new->refcnt, 1);
+ new->alloc_type = SKB_EXT_ALLOC_SHARD_NOREF;
+ }
+
+ return skb_ext_add_finalize(skb, id, new);
+}
+EXPORT_SYMBOL(skb_ext_add_inplace);
+
#ifdef CONFIG_XFRM
static void skb_ext_put_sp(struct sec_path *sp)
{
@@ -6846,8 +6920,11 @@ void __skb_ext_put(struct skb_ext *ext)
if (refcount_read(&ext->refcnt) == 1)
goto free_now;
- if (!refcount_dec_and_test(&ext->refcnt))
+ if (!refcount_dec_and_test(&ext->refcnt)) {
+ if (ext->alloc_type == SKB_EXT_ALLOC_SHARD_REF)
+ goto free_shard;
return;
+ }
free_now:
#ifdef CONFIG_XFRM
if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH))
@@ -6858,9 +6935,74 @@ void __skb_ext_put(struct skb_ext *ext)
skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP));
#endif
- kmem_cache_free(skbuff_ext_cache, ext);
+ switch (ext->alloc_type) {
+ case SKB_EXT_ALLOC_SLAB:
+ kmem_cache_free(skbuff_ext_cache, ext);
+ break;
+ case SKB_EXT_ALLOC_SHARD_NOREF:
+ break;
+ case SKB_EXT_ALLOC_SHARD_REF:
+free_shard:
+ skb_free_frag(ext);
+ break;
+ }
}
EXPORT_SYMBOL(__skb_ext_put);
+
+/* Only safe to use as part of a copy operation */
+void __skb_ext_copy_get(struct skb_ext *ext)
+{
+ struct page *head_page;
+ unsigned int type;
+ int old;
+
+ __refcount_inc(&ext->refcnt, &old);
+
+ type = READ_ONCE(ext->alloc_type);
+ if (type == SKB_EXT_ALLOC_SLAB)
+ return;
+
+ head_page = virt_to_head_page(ext);
+ get_page(head_page);
+
+ /* First reference to a shard does not hold a reference to
+ * the underlying page, take it now. This function can only
+ * be called during copy, so caller has a reference on ext,
+ * we just took the second one - there is no risk that two
+ * callers will race to do this upgrade.
+ */
+ if (type == SKB_EXT_ALLOC_SHARD_NOREF && old == 1) {
+ get_page(head_page);
+ WRITE_ONCE(ext->alloc_type, SKB_EXT_ALLOC_SHARD_REF);
+ }
+}
+EXPORT_SYMBOL(__skb_ext_copy_get);
+
+static int skb_ext_evacuate_head(struct sk_buff *skb)
+{
+ struct skb_ext *old, *new;
+
+ if (likely(!skb->active_extensions))
+ return 0;
+ old = skb->extensions;
+ if (old->alloc_type != SKB_EXT_ALLOC_SHARD_NOREF)
+ return 0;
+
+ WARN_ON_ONCE(!skb->head_frag);
+ new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
+ if (!new)
+ return -ENOMEM;
+
+ memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE);
+ WARN_ON_ONCE(refcount_read(&old->refcnt) != 1);
+ refcount_set(&new->refcnt, 1);
+ new->alloc_type = SKB_EXT_ALLOC_SLAB;
+
+ skb->extensions = new;
+ /* We're dealing with NOREF and we copied contents, so no freeing */
+
+ return 0;
+}
#endif /* CONFIG_SKB_EXTENSIONS */
/**
--
2.39.1