netdev - Re: [RFC bpf-next 3/7] bpf: implement devtx hook points

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAKH8qBvfp7Do1tSD4YiiNVojG2gB9+mrNNLiVFz+ts4gU+pJrA@mail.gmail.com>
Date: Tue, 13 Jun 2023 12:00:25 -0700
From: Stanislav Fomichev <sdf@...gle.com>
To: Simon Horman <simon.horman@...igine.com>
Cc: bpf@...r.kernel.org, ast@...nel.org, daniel@...earbox.net, 
	andrii@...nel.org, martin.lau@...ux.dev, song@...nel.org, yhs@...com, 
	john.fastabend@...il.com, kpsingh@...nel.org, haoluo@...gle.com, 
	jolsa@...nel.org, netdev@...r.kernel.org
Subject: Re: [RFC bpf-next 3/7] bpf: implement devtx hook points

On Tue, Jun 13, 2023 at 8:08 AM Simon Horman <simon.horman@...igine.com> wrote:
>
> On Mon, Jun 12, 2023 at 10:23:03AM -0700, Stanislav Fomichev wrote:
> > devtx is a lightweight set of hooks before and after packet transmission.
> > The hook is supposed to work for both skb and xdp paths by exposing
> > a light-weight packet wrapper via devtx_frame (header portion + frags).
> >
> > devtx is implemented as a tracing program which has access to the
> > XDP-metadata-like kfuncs. The initial set of kfuncs is implemented
> > in the next patch, but the idea is similar to XDP metadata:
> > the kfuncs have netdev-specific implementation, but common
> > interface. Upon loading, the kfuncs are resolved to direct
> > calls against per-netdev implementation. This can be achieved
> > by marking devtx-tracing programs as dev-bound (largely
> > reusing xdp-dev-bound program infrastructure).
> >
> > Attachment and detachment is implemented via syscall BPF program
> > by calling bpf_devtx_sb_attach (attach to tx-submission)
> > or bpf_devtx_cp_attach (attach to tx completion). Right now,
> > the attachment does not return a link and doesn't support
> > multiple programs. I plan to switch to Daniel's bpf_mprog infra
> > once it's available.
> >
> > Cc: netdev@...r.kernel.org
> > Signed-off-by: Stanislav Fomichev <sdf@...gle.com>
>
> ...
>
> > --- a/MAINTAINERS
> > +++ b/MAINTAINERS
> > @@ -22976,11 +22976,13 @@ L:  bpf@...r.kernel.org
> >  S:   Supported
> >  F:   drivers/net/ethernet/*/*/*/*/*xdp*
> >  F:   drivers/net/ethernet/*/*/*xdp*
> > +F:   include/net/devtx.h
> >  F:   include/net/xdp.h
> >  F:   include/net/xdp_priv.h
> >  F:   include/trace/events/xdp.h
> >  F:   kernel/bpf/cpumap.c
> >  F:   kernel/bpf/devmap.c
> > +F:   net/core/devtx.c
> >  F:   net/core/xdp.c
> >  F:   samples/bpf/xdp*
> >  F:   tools/testing/selftests/bpf/*/*xdp*
>
> Hi Stan,
>
> some feedback from my side.
>
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index 08fbd4622ccf..e08e3fd39dfc 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -2238,6 +2238,8 @@ struct net_device {
> >       unsigned int            real_num_rx_queues;
> >
> >       struct bpf_prog __rcu   *xdp_prog;
> > +     struct bpf_prog __rcu   *devtx_sb;
> > +     struct bpf_prog __rcu   *devtx_cp;
>
> It would be good to add these new fields to the kernel doc
> for struct net_device.

Sure, will do!

> >       unsigned long           gro_flush_timeout;
> >       int                     napi_defer_hard_irqs;
> >  #define GRO_LEGACY_MAX_SIZE  65536u
> > diff --git a/include/net/devtx.h b/include/net/devtx.h
> > new file mode 100644
> > index 000000000000..7eab66d0ce80
> > --- /dev/null
> > +++ b/include/net/devtx.h
> > @@ -0,0 +1,76 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +#ifndef __LINUX_NET_DEVTX_H__
> > +#define __LINUX_NET_DEVTX_H__
> > +
> > +#include <linux/jump_label.h>
> > +#include <linux/skbuff.h>
> > +#include <linux/netdevice.h>
> > +#include <net/xdp.h>
> > +
> > +struct devtx_frame {
> > +     void *data;
> > +     u16 len;
> > +     struct skb_shared_info *sinfo; /* for frags */
> > +};
> > +
> > +#ifdef CONFIG_NET
> > +void devtx_submit(struct net_device *netdev, struct devtx_frame *ctx);
> > +void devtx_complete(struct net_device *netdev, struct devtx_frame *ctx);
> > +bool is_devtx_kfunc(u32 kfunc_id);
> > +void devtx_shutdown(struct net_device *netdev);
> > +
> > +static inline void devtx_frame_from_skb(struct devtx_frame *ctx, struct sk_buff *skb)
> > +{
> > +     ctx->data = skb->data;
> > +     ctx->len = skb_headlen(skb);
> > +     ctx->sinfo = skb_shinfo(skb);
> > +}
> > +
> > +static inline void devtx_frame_from_xdp(struct devtx_frame *ctx, struct xdp_frame *xdpf)
> > +{
> > +     ctx->data = xdpf->data;
> > +     ctx->len = xdpf->len;
> > +     ctx->sinfo = xdp_frame_has_frags(xdpf) ? xdp_get_shared_info_from_frame(xdpf) : NULL;
> > +}
> > +
> > +DECLARE_STATIC_KEY_FALSE(devtx_enabled);
> > +
> > +static inline bool devtx_submit_enabled(struct net_device *netdev)
> > +{
> > +     return static_branch_unlikely(&devtx_enabled) &&
> > +            rcu_access_pointer(netdev->devtx_sb);
> > +}
> > +
> > +static inline bool devtx_complete_enabled(struct net_device *netdev)
> > +{
> > +     return static_branch_unlikely(&devtx_enabled) &&
> > +            rcu_access_pointer(netdev->devtx_cp);
> > +}
> > +#else
> > +static inline void devtx_submit(struct net_device *netdev, struct devtx_frame *ctx)
> > +{
> > +}
> > +
> > +static inline void devtx_complete(struct net_device *netdev, struct devtx_frame *ctx)
> > +{
> > +}
> > +
> > +static inline bool is_devtx_kfunc(u32 kfunc_id)
> > +{
> > +     return false;
> > +}
> > +
> > +static inline void devtx_shutdown(struct net_device *netdev)
> > +{
> > +}
> > +
> > +static inline void devtx_frame_from_skb(struct devtx_frame *ctx, struct sk_buff *skb)
> > +{
> > +}
> > +
> > +static inline void devtx_frame_from_xdp(struct devtx_frame *ctx, struct xdp_frame *xdpf)
> > +{
> > +}
> > +#endif
> > +
> > +#endif /* __LINUX_NET_DEVTX_H__ */
> > diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
> > index 235d81f7e0ed..9cfe96422c80 100644
> > --- a/kernel/bpf/offload.c
> > +++ b/kernel/bpf/offload.c
> > @@ -25,6 +25,7 @@
> >  #include <linux/rhashtable.h>
> >  #include <linux/rtnetlink.h>
> >  #include <linux/rwsem.h>
> > +#include <net/devtx.h>
> >
> >  /* Protects offdevs, members of bpf_offload_netdev and offload members
> >   * of all progs.
> > @@ -228,6 +229,7 @@ int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr)
> >       int err;
> >
> >       if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
> > +         attr->prog_type != BPF_PROG_TYPE_TRACING &&
> >           attr->prog_type != BPF_PROG_TYPE_XDP)
> >               return -EINVAL;
> >
> > @@ -238,6 +240,10 @@ int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr)
> >           attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY)
> >               return -EINVAL;
> >
> > +     if (attr->prog_type == BPF_PROG_TYPE_TRACING &&
> > +         !is_devtx_kfunc(prog->aux->attach_btf_id))
> > +             return -EINVAL;
> > +
> >       netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex);
> >       if (!netdev)
> >               return -EINVAL;
> > diff --git a/net/core/Makefile b/net/core/Makefile
> > index 8f367813bc68..c1db05ccfac7 100644
> > --- a/net/core/Makefile
> > +++ b/net/core/Makefile
> > @@ -39,4 +39,5 @@ obj-$(CONFIG_FAILOVER) += failover.o
> >  obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
> >  obj-$(CONFIG_BPF_SYSCALL) += sock_map.o
> >  obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o
> > +obj-$(CONFIG_BPF_SYSCALL) += devtx.o
> >  obj-$(CONFIG_OF)     += of_net.o
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index 3393c2f3dbe8..ef0e65e68024 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -150,6 +150,7 @@
> >  #include <linux/pm_runtime.h>
> >  #include <linux/prandom.h>
> >  #include <linux/once_lite.h>
> > +#include <net/devtx.h>
> >
> >  #include "dev.h"
> >  #include "net-sysfs.h"
> > @@ -10875,6 +10876,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
> >               dev_shutdown(dev);
> >
> >               dev_xdp_uninstall(dev);
> > +             devtx_shutdown(dev);
> >               bpf_dev_bound_netdev_unregister(dev);
> >
> >               netdev_offload_xstats_disable_all(dev);
> > diff --git a/net/core/devtx.c b/net/core/devtx.c
> > new file mode 100644
> > index 000000000000..b7cbc26d1c01
> > --- /dev/null
> > +++ b/net/core/devtx.c
> > @@ -0,0 +1,208 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +
> > +#include <net/devtx.h>
> > +#include <linux/filter.h>
> > +
> > +DEFINE_STATIC_KEY_FALSE(devtx_enabled);
> > +EXPORT_SYMBOL_GPL(devtx_enabled);
> > +
> > +static void devtx_run(struct net_device *netdev, struct devtx_frame *ctx, struct bpf_prog **pprog)
>
> Is an __rcu annotation appropriate for prog here?
> Also elsewhere in this patch.

Good point. Maybe I should rcu_dereference it them somewhere on top.
Let me try to find the best place..

> > +{
> > +     struct bpf_prog *prog;
> > +     void *real_ctx[1] = {ctx};
> > +
> > +     prog = rcu_dereference(*pprog);
> > +     if (likely(prog))
> > +             bpf_prog_run(prog, real_ctx);
> > +}
> > +
> > +void devtx_submit(struct net_device *netdev, struct devtx_frame *ctx)
> > +{
> > +     rcu_read_lock();
> > +     devtx_run(netdev, ctx, &netdev->devtx_sb);
> > +     rcu_read_unlock();
> > +}
> > +EXPORT_SYMBOL_GPL(devtx_submit);
> > +
> > +void devtx_complete(struct net_device *netdev, struct devtx_frame *ctx)
> > +{
> > +     rcu_read_lock();
> > +     devtx_run(netdev, ctx, &netdev->devtx_cp);
> > +     rcu_read_unlock();
> > +}
> > +EXPORT_SYMBOL_GPL(devtx_complete);
> > +
> > +/**
> > + * devtx_sb - Called for every egress netdev packet
>
> As this is a kernel doc, it would be good to document the ctx parameter here.

I didn't really find a convincing way to add a comment, I've had the
following which I've removed prio to submission:
@ctx devtx_frame context

But it doesn't seem like it brings anything useful? Or ok to keep it that way?

> > + *
> > + * Note: this function is never actually called by the kernel and declared
> > + * only to allow loading an attaching appropriate tracepoints.
> > + */
> > +__weak noinline void devtx_sb(struct devtx_frame *ctx)
>
> I guess this is intentional.
> But gcc complains that this is neither static nor is a forward
> declaration provided. Likewise for devtx_cp()

Copy-pasted from hid-bpf; let me see if they have a forward decl somewhere..

> > +{
> > +}
> > +
> > +/**
> > + * devtx_cp - Called upon egress netdev packet completion
>
> Likewise, here too.
>
> > + *
> > + * Note: this function is never actually called by the kernel and declared
> > + * only to allow loading an attaching appropriate tracepoints.
> > + */
> > +__weak noinline void devtx_cp(struct devtx_frame *ctx)
> > +{
> > +}
> > +
> > +BTF_SET8_START(bpf_devtx_hook_ids)
> > +BTF_ID_FLAGS(func, devtx_sb)
> > +BTF_ID_FLAGS(func, devtx_cp)
> > +BTF_SET8_END(bpf_devtx_hook_ids)
> > +
> > +static const struct btf_kfunc_id_set bpf_devtx_hook_set = {
> > +     .owner = THIS_MODULE,
> > +     .set   = &bpf_devtx_hook_ids,
> > +};
> > +
> > +static DEFINE_MUTEX(devtx_attach_lock);
> > +
> > +static int __bpf_devtx_detach(struct net_device *netdev, struct bpf_prog **pprog)
> > +{
>
> As per my prior comment about *prog and __rcu annotations.
> I'm genuinely unsure how the usage of **pprog in this function sits with RCU.

Yeah, I'm a bit sloppy, let me figure out a proper way to annotate it.


> > +     if (!*pprog)
> > +             return -EINVAL;
> > +     bpf_prog_put(*pprog);
> > +     *pprog = NULL;
> > +
> > +     static_branch_dec(&devtx_enabled);
> > +     return 0;
> > +}
> > +
> > +static int __bpf_devtx_attach(struct net_device *netdev, int prog_fd,
> > +                           const char *attach_func_name, struct bpf_prog **pprog)
> > +{
> > +     struct bpf_prog *prog;
> > +     int ret = 0;
> > +
> > +     if (prog_fd < 0)
> > +             return __bpf_devtx_detach(netdev, pprog);
> > +
> > +     if (*pprog)
> > +             return -EBUSY;
> > +
> > +     prog = bpf_prog_get(prog_fd);
> > +     if (IS_ERR(prog))
> > +             return PTR_ERR(prog);
> > +
> > +     if (prog->type != BPF_PROG_TYPE_TRACING ||
> > +         prog->expected_attach_type != BPF_TRACE_FENTRY ||
> > +         !bpf_prog_is_dev_bound(prog->aux) ||
> > +         !bpf_offload_dev_match(prog, netdev) ||
> > +         strcmp(prog->aux->attach_func_name, attach_func_name)) {
> > +             bpf_prog_put(prog);
> > +             return -EINVAL;
> > +     }
> > +
> > +     *pprog = prog;
> > +     static_branch_inc(&devtx_enabled);
> > +
> > +     return ret;
> > +}
>
> ...