[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20181126113857.29270-3-fw@strlen.de>
Date: Mon, 26 Nov 2018 12:38:56 +0100
From: Florian Westphal <fw@...len.de>
To: <netdev@...r.kernel.org>
Cc: Florian Westphal <fw@...len.de>
Subject: [RFC PATCH 2/3] sk_buff: add skb extension infrastructure
This patch adds an extension infrastructure for sk_buff:
1. extension memory is released when the sk_buff is free'd.
2. data is shared after cloning an skb.
This is also how xfrm and bridge netfilter skb-associated data
(skb->sp and skb->nf_bridge) are handled.
Two new members are added to sk_buff:
1. 'active_extensions' byte (filling a hole), telling which extensions
have been allocated for the skb.
2. extension pointer, located at the end of the sk_buff.
If active_extensions is 0, the content of the extension pointer is undefined.
The 'nf_bridge' pointer is removed in a followup patch, so the overall
sk_buff size remains the same.
This adds extra code to skb clone and free paths (to deal with
refcount/free of extension area) but replaces the existing code that
deals with skb->nf_bridge.
This patch only adds the basic infrastructure, the nf_bridge conversion
is done in the next patch.
Conversion of skb->sp (ipsec/xfrm secpath) to an skb extension is planned
as a followup.
Signed-off-by: Florian Westphal <fw@...len.de>
---
include/linux/skbuff.h | 124 +++++++++++++++++++++++++++++++++++++-
net/Kconfig | 3 +
net/core/skbuff.c | 131 +++++++++++++++++++++++++++++++++++++++++
net/ipv4/ip_output.c | 1 +
net/ipv6/ip6_output.c | 1 +
5 files changed, 259 insertions(+), 1 deletion(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 73902acf2b71..832904d71a85 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -245,6 +245,7 @@ struct iov_iter;
struct napi_struct;
struct bpf_prog;
union bpf_attr;
+struct skb_ext;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct nf_conntrack {
@@ -633,6 +634,7 @@ typedef unsigned char *sk_buff_data_t;
* @queue_mapping: Queue mapping for multiqueue devices
* @xmit_more: More SKBs are pending for this queue
* @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves
+ * @active_extensions: active extensions (skb_ext_id types)
* @ndisc_nodetype: router type (from link layer)
* @ooo_okay: allow the mapping of a socket to a queue to be changed
* @l4_hash: indicate hash is a canonical 4-tuple hash over transport
@@ -662,6 +664,7 @@ typedef unsigned char *sk_buff_data_t;
* @data: Data head pointer
* @truesize: Buffer size
* @users: User count - see {datagram,tcp}.c
+ * @extensions: allocated extensions, valid if active_extensions is nonzero
*/
struct sk_buff {
@@ -744,7 +747,9 @@ struct sk_buff {
head_frag:1,
xmit_more:1,
pfmemalloc:1;
-
+#ifdef CONFIG_SKB_EXTENSIONS
+ __u8 active_extensions;
+#endif
/* fields enclosed in headers_start/headers_end are copied
* using a single memcpy() in __copy_skb_header()
*/
@@ -866,6 +871,11 @@ struct sk_buff {
*data;
unsigned int truesize;
refcount_t users;
+
+#ifdef CONFIG_SKB_EXTENSIONS
+ /* only useable after checking ->active_extensions != 0 */
+ struct skb_ext *extensions;
+#endif
};
#ifdef __KERNEL__
@@ -3889,6 +3899,118 @@ static inline void nf_conntrack_get(struct nf_conntrack *nfct)
atomic_inc(&nfct->use);
}
#endif
+
+#ifdef CONFIG_SKB_EXTENSIONS
+enum skb_ext_id {
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ SKB_EXT_BRIDGE_NF, /* extension holding a struct nf_bridge_info */
+#endif
+ SKB_EXT_NUM, /* number of extension ids; must be last */
+};
+
+/* each extension is aligned to this many bytes; must equal 1 << SKB_EXT_ALIGN_SHIFT */
+#define SKB_EXT_ALIGN 8
+/* offsets/len are stored in SKB_EXT_ALIGN units: left-shift by this to get bytes */
+#define SKB_EXT_ALIGN_SHIFT 3
+
+/**
+ * struct skb_ext - sk_buff extensions
+ * @refcnt: 1 on allocation, deallocated on 0
+ * @offset: per-extension offset, added to the start of this structure to
+ *          obtain the extension address; 0 means the extension was never
+ *          allocated
+ * @len: size currently allocated (header included), stored in
+ *       SKB_EXT_ALIGN (1 << SKB_EXT_ALIGN_SHIFT) byte units
+ * @data: start of extension data, variable sized
+ *
+ * Note: offsets and len are stored in chunks of 8 bytes. This allows
+ * the use of 'u8' types while still allowing up to 2kb worth of
+ * extension data.
+ */
+struct skb_ext {
+ refcount_t refcnt;
+ u8 offset[SKB_EXT_NUM]; /* chunks of 8 bytes */
+ u8 len; /* same, i.e. size == len << 3 */
+ char data[0] __aligned(SKB_EXT_ALIGN);
+};
+
+void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id);
+void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id);
+void __skb_ext_free(struct skb_ext *ext);
+
+/* Drop one reference on @ext (NULL is allowed); free it on the last put. */
+static inline void __skb_ext_put(struct skb_ext *ext)
+{
+	if (!ext)
+		return;
+
+	if (refcount_dec_and_test(&ext->refcnt))
+		__skb_ext_free(ext);
+}
+
+/* Release @skb's reference on its extension area, if it has any extension. */
+static inline void skb_ext_put(struct sk_buff *skb)
+{
+	/* ->extensions is only meaningful while ->active_extensions != 0 */
+	if (!skb->active_extensions)
+		return;
+
+	__skb_ext_put(skb->extensions);
+}
+
+/* Take an additional reference on @skb's extension area, if present. */
+static inline void skb_ext_get(struct sk_buff *skb)
+{
+	struct skb_ext *area;
+
+	if (!skb->active_extensions)
+		return;
+
+	area = skb->extensions;
+	if (area)
+		refcount_inc(&area->refcnt);
+}
+
+/*
+ * Make @dst share @src's extension area: copy the active bitmask and, when
+ * @src has extensions, take a reference and copy the pointer.  Whatever
+ * @dst referenced before is NOT released here.
+ */
+static inline void __skb_ext_copy(struct sk_buff *dst,
+				  const struct sk_buff *src)
+{
+	struct skb_ext *shared;
+
+	dst->active_extensions = src->active_extensions;
+	if (!src->active_extensions)
+		return;
+
+	shared = src->extensions;
+	if (shared)
+		refcount_inc(&shared->refcnt);
+
+	dst->extensions = shared;
+}
+
+/* Replace @dst's extension reference with a shared reference to @src's. */
+static inline void skb_ext_copy(struct sk_buff *dst,
+				const struct sk_buff *src)
+{
+	/* drop what dst referenced before it takes over src's area */
+	skb_ext_put(dst);
+	__skb_ext_copy(dst, src);
+}
+
+/* True if space for extension @i was allocated in @ext (nonzero offset). */
+static inline bool __skb_ext_exist(const struct skb_ext *ext, enum skb_ext_id i)
+{
+	return ext->offset[i] != 0;
+}
+
+/* True if extension @id is marked active in @skb's active_extensions mask. */
+static inline bool skb_ext_exist(const struct sk_buff *skb, enum skb_ext_id id)
+{
+	return (skb->active_extensions & (1 << id)) != 0;
+}
+
+/* Deactivate extension @id on @skb; does nothing if it is not active. */
+static inline void skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
+{
+	if (!skb_ext_exist(skb, id))
+		return;
+
+	__skb_ext_del(skb, id);
+}
+
+/*
+ * Look up extension @id of @skb.
+ *
+ * Returns a pointer to the extension data, or NULL when the extension is
+ * not active on @skb or no space was allocated for it.
+ */
+static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id)
+{
+	struct skb_ext *ext;
+
+	if (!skb_ext_exist(skb, id))
+		return NULL;
+
+	ext = skb->extensions;
+	if (!ext || !__skb_ext_exist(ext, id))
+		return NULL;
+
+	/* offsets are stored in SKB_EXT_ALIGN units relative to @ext itself */
+	return (void *)ext + (ext->offset[id] << SKB_EXT_ALIGN_SHIFT);
+}
+#else /* !CONFIG_SKB_EXTENSIONS: empty stubs so callers need no #ifdef */
+static inline void skb_ext_put(struct sk_buff *skb) {}
+static inline void skb_ext_get(struct sk_buff *skb) {}
+static inline void skb_ext_del(struct sk_buff *skb, int unused) {}
+static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {}
+static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {}
+#endif /* CONFIG_SKB_EXTENSIONS */
+
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge)
{
diff --git a/net/Kconfig b/net/Kconfig
index f235edb593ba..93b291292860 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -51,6 +51,9 @@ config NET_INGRESS
config NET_EGRESS
bool
+config SKB_EXTENSIONS
+ bool
+
menu "Networking options"
source "net/packet/Kconfig"
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 02cd7ae3d0fb..e29016030633 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -617,6 +617,7 @@ void skb_release_head_state(struct sk_buff *skb)
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
nf_bridge_put(skb->nf_bridge);
#endif
+ skb_ext_put(skb);
}
/* Free everything but the sk_buff shell. */
@@ -796,6 +797,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
new->dev = old->dev;
memcpy(new->cb, old->cb, sizeof(old->cb));
skb_dst_copy(new, old);
+ __skb_ext_copy(new, old);
#ifdef CONFIG_XFRM
new->sp = secpath_get(old->sp);
#endif
@@ -5531,3 +5533,132 @@ void skb_condense(struct sk_buff *skb)
*/
skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
}
+
+#ifdef CONFIG_SKB_EXTENSIONS
+static const u8 skb_ext_type_len[] = { /* size in bytes of each extension, indexed by skb_ext_id */
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ [SKB_EXT_BRIDGE_NF] = sizeof(struct nf_bridge_info), /* must fit in a u8 */
+#endif
+};
+
+/* Translate the stored chunk offset of extension @id into its address. */
+static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
+{
+	unsigned int byte_off = ext->offset[id] << SKB_EXT_ALIGN_SHIFT;
+
+	return (void *)ext + byte_off;
+}
+
+/*
+ * Allocate a private extension area of @len bytes.
+ *
+ * With @old == NULL the offset table is cleared; otherwise the currently
+ * used part of @old is copied over and the caller's reference on @old is
+ * dropped.  The new area always starts out with a refcount of 1.
+ *
+ * Returns the new area, or NULL on allocation failure (@old is then left
+ * untouched).
+ */
+static struct skb_ext *skb_ext_cow(unsigned int len,
+				   struct skb_ext *old)
+{
+	struct skb_ext *copy;
+
+	copy = kmalloc(len, GFP_ATOMIC);
+	if (!copy)
+		return NULL;
+
+	if (old)
+		memcpy(copy, old, old->len << SKB_EXT_ALIGN_SHIFT);
+	else
+		memset(copy->offset, 0, sizeof(copy->offset));
+
+	/* must come after the memcpy(), which also copied old's refcount */
+	refcount_set(&copy->refcnt, 1);
+
+	/* NULL-safe: nothing to release for a fresh allocation */
+	__skb_ext_put(old);
+	return copy;
+}
+
+static __always_inline unsigned int skb_ext_total_length(void)
+{
+ return 0 + /* sum of the sizes of all configured extensions, in bytes */
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ skb_ext_type_len[SKB_EXT_BRIDGE_NF] +
+#endif
+ 0; /* leading/trailing 0 keep the expression valid with no extension configured */
+}
+
+/**
+ * skb_ext_add - allocate space for given extension, COW if needed
+ * @skb: buffer
+ * @id: extension to allocate space for
+ *
+ * Allocates enough space for the given extension.
+ * If the extension is already present, a pointer to that extension
+ * is returned.
+ *
+ * If the skb was cloned, COW applies and the returned memory can be
+ * modified without changing the extension space of cloned buffers.
+ *
+ * The size of the extension area is always rounded up to a multiple of
+ * SKB_EXT_ALIGN, because it is recorded in SKB_EXT_ALIGN sized chunks.
+ * Without the rounding, the tail of an extension whose size is not a
+ * multiple of SKB_EXT_ALIGN would be dropped on COW and overlapped by
+ * the next extension that gets added.
+ *
+ * Returns pointer to the extension or NULL on allocation failure.
+ */
+void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
+{
+	unsigned int newlen, newoff, oldlen;
+	struct skb_ext *new, *old = NULL;
+	bool cow_needed = true;
+
+	/* one bit per extension id in the u8 active_extensions mask */
+	BUILD_BUG_ON(SKB_EXT_NUM >= 8);
+	/* sizes are stored in a u8 counting SKB_EXT_ALIGN sized chunks */
+	BUILD_BUG_ON(skb_ext_total_length() > (255 << SKB_EXT_ALIGN_SHIFT));
+
+	if (skb->active_extensions) {
+		old = skb->extensions;
+
+		/* the area is shared if anyone else holds a reference */
+		cow_needed = refcount_read(&old->refcnt) > 1;
+
+		if (__skb_ext_exist(old, id)) {
+			if (!cow_needed) {
+				new = old;
+				goto set_active;
+			}
+
+			/* extension was allocated previously and it
+			 * might be used by a cloned skb. COW needed.
+			 */
+			new = skb_ext_cow(old->len << SKB_EXT_ALIGN_SHIFT, old);
+			if (!new)
+				return NULL;
+
+			skb->extensions = new;
+			goto set_active;
+		}
+		oldlen = old->len << SKB_EXT_ALIGN_SHIFT;
+	} else {
+		/* fresh area: the first extension follows the header */
+		oldlen = sizeof(*new);
+	}
+
+	newoff = ALIGN(oldlen, SKB_EXT_ALIGN);
+	/* round up so that ->len (in chunks) covers the whole extension */
+	newlen = ALIGN(newoff + skb_ext_type_len[id], SKB_EXT_ALIGN);
+
+	if (cow_needed)
+		new = skb_ext_cow(newlen, old);
+	else
+		new = krealloc(old, newlen, GFP_ATOMIC);
+	if (!new)
+		return NULL;
+
+	new->offset[id] = newoff >> SKB_EXT_ALIGN_SHIFT;
+	new->len = newlen >> SKB_EXT_ALIGN_SHIFT;
+	skb->extensions = new;
+set_active:
+	skb->active_extensions |= 1 << id;
+	return skb_ext_get_ptr(new, id);
+}
+EXPORT_SYMBOL(skb_ext_add);
+
+/*
+ * Clear extension @id from @skb's active mask.  Once no extension remains
+ * active, the skb's reference on the extension area is dropped as well.
+ */
+void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
+{
+	struct skb_ext *area;
+
+	skb->active_extensions &= ~(1 << id);
+	if (skb->active_extensions)
+		return;
+
+	/* last active extension is gone: detach the area, then release it */
+	area = skb->extensions;
+	skb->extensions = NULL;
+	__skb_ext_put(area);
+}
+EXPORT_SYMBOL(__skb_ext_del);
+
+void __skb_ext_free(struct skb_ext *ext)
+{
+ kfree(ext); /* called from __skb_ext_put() once the refcount dropped to 0 */
+}
+EXPORT_SYMBOL(__skb_ext_free);
+#endif /* CONFIG_SKB_EXTENSIONS */
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index c09219e7f230..a12e12f983d5 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -533,6 +533,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->tc_index = from->tc_index;
#endif
nf_copy(to, from);
+ skb_ext_copy(to, from);
#if IS_ENABLED(CONFIG_IP_VS)
to->ipvs_property = from->ipvs_property;
#endif
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 89e0d5118afe..7eeb0f24be87 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -574,6 +574,7 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->tc_index = from->tc_index;
#endif
nf_copy(to, from);
+ skb_ext_copy(to, from);
skb_copy_secmark(to, from);
}
--
2.18.1
Powered by blists - more mailing lists