[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20200527205300.GC57268@google.com>
Date: Wed, 27 May 2020 13:53:00 -0700
From: sdf@...gle.com
To: Jakub Sitnicki <jakub@...udflare.com>
Cc: bpf@...r.kernel.org, netdev@...r.kernel.org,
kernel-team@...udflare.com
Subject: Re: [PATCH bpf-next 5/8] bpf: Add link-based BPF program attachment
to network namespace
On 05/27, Jakub Sitnicki wrote:
> Add support for bpf() syscall subcommands that operate on
> bpf_link (LINK_CREATE, LINK_UPDATE, OBJ_GET_INFO) for attach points tied
> to
> network namespaces (that is flow dissector at the moment).
> Link-based and prog-based attachment can be used interchangeably, but only
> one can be in use at a time. Attempts to attach a link when a prog is
> already attached directly, and the other way around, will be met with
> -EBUSY.
> Attachment of multiple links of same attach type to one netns is not
> supported, with the intention to lift it when a use-case presents
> itself. Because of that attempts to create a netns link, when one already
> exists result in -E2BIG error, signifying that there is no space left for
> another attachment.
> Link-based attachments to netns don't keep a netns alive by holding a ref
> to it. Instead links get auto-detached from netns when the latter is being
> destroyed by a pernet pre_exit callback.
> When auto-detached, link lives in defunct state as long there are open FDs
> for it. -ENOLINK is returned if a user tries to update a defunct link.
> Because bpf_link to netns doesn't hold a ref to struct net, special care
> is
> taken when releasing the link. The netns might be getting torn down when
> the release function tries to access it to detach the link.
> To ensure the struct net object is alive when release function accesses it
> we rely on the fact that cleanup_net(), struct net destructor, calls
> synchronize_rcu() after invoking pre_exit callbacks. If auto-detach from
> pre_exit happens first, link release will not attempt to access struct
> net.
> Same applies the other way around, network namespace doesn't keep an
> attached link alive because by not holding a ref to it. Instead bpf_links
> to netns are RCU-freed, so that pernet pre_exit callback can safely access
> and auto-detach the link when racing with link release/free.
> Signed-off-by: Jakub Sitnicki <jakub@...udflare.com>
> ---
> include/linux/bpf-netns.h | 8 +
> include/net/netns/bpf.h | 1 +
> include/uapi/linux/bpf.h | 5 +
> kernel/bpf/net_namespace.c | 257 ++++++++++++++++++++++++++++++++-
> kernel/bpf/syscall.c | 3 +
> net/core/filter.c | 1 +
> tools/include/uapi/linux/bpf.h | 5 +
> 7 files changed, 278 insertions(+), 2 deletions(-)
> diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h
> index f3aec3d79824..4052d649f36d 100644
> --- a/include/linux/bpf-netns.h
> +++ b/include/linux/bpf-netns.h
> @@ -34,6 +34,8 @@ int netns_bpf_prog_query(const union bpf_attr *attr,
> int netns_bpf_prog_attach(const union bpf_attr *attr,
> struct bpf_prog *prog);
> int netns_bpf_prog_detach(const union bpf_attr *attr);
> +int netns_bpf_link_create(const union bpf_attr *attr,
> + struct bpf_prog *prog);
> #else
> static inline int netns_bpf_prog_query(const union bpf_attr *attr,
> union bpf_attr __user *uattr)
> @@ -51,6 +53,12 @@ static inline int netns_bpf_prog_detach(const union
> bpf_attr *attr)
> {
> return -EOPNOTSUPP;
> }
> +
> +static inline int netns_bpf_link_create(const union bpf_attr *attr,
> + struct bpf_prog *prog)
> +{
> + return -EOPNOTSUPP;
> +}
> #endif
> #endif /* _BPF_NETNS_H */
> diff --git a/include/net/netns/bpf.h b/include/net/netns/bpf.h
> index a858d1c5b166..baabefdeb10c 100644
> --- a/include/net/netns/bpf.h
> +++ b/include/net/netns/bpf.h
> @@ -12,6 +12,7 @@ struct bpf_prog;
> struct netns_bpf {
> struct bpf_prog __rcu *progs[MAX_NETNS_BPF_ATTACH_TYPE];
> + struct bpf_link __rcu *links[MAX_NETNS_BPF_ATTACH_TYPE];
> };
> #endif /* __NETNS_BPF_H__ */
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 54b93f8b49b8..05469d3c5c82 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -235,6 +235,7 @@ enum bpf_link_type {
> BPF_LINK_TYPE_TRACING = 2,
> BPF_LINK_TYPE_CGROUP = 3,
> BPF_LINK_TYPE_ITER = 4,
> + BPF_LINK_TYPE_NETNS = 5,
> MAX_BPF_LINK_TYPE,
> };
> @@ -3753,6 +3754,10 @@ struct bpf_link_info {
> __u64 cgroup_id;
> __u32 attach_type;
> } cgroup;
> + struct {
> + __u32 netns_ino;
> + __u32 attach_type;
> + } netns;
> };
> } __attribute__((aligned(8)));
> diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c
> index fc89154aed27..1c6009ab93c5 100644
> --- a/kernel/bpf/net_namespace.c
> +++ b/kernel/bpf/net_namespace.c
> @@ -8,9 +8,155 @@
> * Functions to manage BPF programs attached to netns
> */
> -/* Protects updates to netns_bpf */
> +struct bpf_netns_link {
> + struct bpf_link link;
> + enum bpf_attach_type type;
> + enum netns_bpf_attach_type netns_type;
> +
> + /* struct net is not RCU-freed but we treat it as such because
> + * our pre_exit callback will NULL this pointer before
> + * cleanup_net() calls synchronize_rcu().
> + */
> + struct net __rcu *net;
> +
> + /* bpf_netns_link is RCU-freed for pre_exit callback invoked
> + * by cleanup_net() to safely access the link.
> + */
> + struct rcu_head rcu;
> +};
> +
> +/* Protects updates to netns_bpf. */
> DEFINE_MUTEX(netns_bpf_mutex);
> +static inline struct bpf_netns_link *to_bpf_netns_link(struct bpf_link
> *link)
> +{
> + return container_of(link, struct bpf_netns_link, link);
> +}
> +
> +/* Called with RCU read lock. */
As mentioned earlier, ^^^ probably doesn't make sense. Try to avoid
RCU_INIT_POINTER under read lock.
> +static void __net_exit
> +bpf_netns_link_auto_detach(struct net *net, enum netns_bpf_attach_type
> type)
> +{
> + struct bpf_netns_link *net_link;
> + struct bpf_link *link;
> +
> + link = rcu_dereference(net->bpf.links[type]);
Btw, maybe use rcu_deref_protected with !check_net here and drop
rcu read lock requirement?
> + if (!link)
> + return;
> + net_link = to_bpf_netns_link(link);
> + RCU_INIT_POINTER(net_link->net, NULL);
> +}
> +
> +static void bpf_netns_link_release(struct bpf_link *link)
> +{
> + struct bpf_netns_link *net_link = to_bpf_netns_link(link);
> + enum netns_bpf_attach_type type = net_link->netns_type;
> + struct net *net;
> +
> + /* Link auto-detached by dying netns. */
> + if (!rcu_access_pointer(net_link->net))
> + return;
> +
> + mutex_lock(&netns_bpf_mutex);
> +
> + /* Recheck after potential sleep. We can race with cleanup_net
> + * here, but if we see a non-NULL struct net pointer pre_exit
> + * and following synchronize_rcu() has not happened yet, and
> + * we have until the end of grace period to access net.
> + */
> + rcu_read_lock();
> + net = rcu_dereference(net_link->net);
Use rcu_dereferece_protected with netns_bpf_mutex here instead of
grabbing rcu read lock? Or are you guarding here against 'net'
going away? In that case, moving unlock higher can make it more
visually clear.
> + if (net) {
> + RCU_INIT_POINTER(net->bpf.links[type], NULL);
> + RCU_INIT_POINTER(net->bpf.progs[type], NULL);
> + }
> + rcu_read_unlock();
> +
> + mutex_unlock(&netns_bpf_mutex);
> +}
> +
> +static void bpf_netns_link_dealloc(struct bpf_link *link)
> +{
> + struct bpf_netns_link *net_link = to_bpf_netns_link(link);
> +
> + /* Delay kfree in case we're racing with cleanup_net. */
> + kfree_rcu(net_link, rcu);
> +}
> +
> +static int bpf_netns_link_update_prog(struct bpf_link *link,
> + struct bpf_prog *new_prog,
> + struct bpf_prog *old_prog)
> +{
> + struct bpf_netns_link *net_link = to_bpf_netns_link(link);
> + struct net *net;
> + int ret = 0;
> +
> + if (old_prog && old_prog != link->prog)
> + return -EPERM;
> + if (new_prog->type != link->prog->type)
> + return -EINVAL;
> +
> + mutex_lock(&netns_bpf_mutex);
> + rcu_read_lock();
Again, what are you protecting here? 'net' disappearing? Then maybe do:
rcu_read_lock
net = rcu_deref
valid = net && check_net(net)
rcu_read_unlock
if (!valid)
bail out.
Otherwise, those mutex_lock+rcu_read_lock are a bit confusing.
> +
> + net = rcu_dereference(net_link->net);
> + if (!net || !check_net(net)) {
> + /* Link auto-detached or netns dying */
> + ret = -ENOLINK;
> + goto out_unlock;
> + }
> +
> + old_prog = xchg(&link->prog, new_prog);
> + bpf_prog_put(old_prog);
> +
> +out_unlock:
> + rcu_read_unlock();
> + mutex_unlock(&netns_bpf_mutex);
> +
> + return ret;
> +}
> +
> +static int bpf_netns_link_fill_info(const struct bpf_link *link,
> + struct bpf_link_info *info)
> +{
> + const struct bpf_netns_link *net_link;
> + unsigned int inum;
> + struct net *net;
> +
> + net_link = container_of(link, struct bpf_netns_link, link);
> +
> + rcu_read_lock();
> + net = rcu_dereference(net_link->net);
> + if (net)
> + inum = net->ns.inum;
> + rcu_read_unlock();
> +
> + info->netns.netns_ino = inum;
> + info->netns.attach_type = net_link->type;
> + return 0;
> +}
> +
> +static void bpf_netns_link_show_fdinfo(const struct bpf_link *link,
> + struct seq_file *seq)
> +{
> + struct bpf_link_info info = {};
> +
> + bpf_netns_link_fill_info(link, &info);
> + seq_printf(seq,
> + "netns_ino:\t%u\n"
> + "attach_type:\t%u\n",
> + info.netns.netns_ino,
> + info.netns.attach_type);
> +}
> +
> +static const struct bpf_link_ops bpf_netns_link_ops = {
> + .release = bpf_netns_link_release,
> + .dealloc = bpf_netns_link_dealloc,
> + .update_prog = bpf_netns_link_update_prog,
> + .fill_link_info = bpf_netns_link_fill_info,
> + .show_fdinfo = bpf_netns_link_show_fdinfo,
> +};
> +
> int netns_bpf_prog_query(const union bpf_attr *attr,
> union bpf_attr __user *uattr)
> {
> @@ -67,6 +213,13 @@ int netns_bpf_prog_attach(const union bpf_attr *attr,
> struct bpf_prog *prog)
> net = current->nsproxy->net_ns;
> mutex_lock(&netns_bpf_mutex);
> +
> + /* Attaching prog directly is not compatible with links */
> + if (rcu_access_pointer(net->bpf.links[type])) {
> + ret = -EBUSY;
> + goto unlock;
> + }
> +
> switch (type) {
> case NETNS_BPF_FLOW_DISSECTOR:
> ret = flow_dissector_bpf_prog_attach(net, prog);
> @@ -75,6 +228,7 @@ int netns_bpf_prog_attach(const union bpf_attr *attr,
> struct bpf_prog *prog)
> ret = -EINVAL;
> break;
> }
> +unlock:
> mutex_unlock(&netns_bpf_mutex);
> return ret;
> @@ -85,6 +239,10 @@ static int __netns_bpf_prog_detach(struct net *net,
> {
> struct bpf_prog *attached;
> + /* Progs attached via links cannot be detached */
> + if (rcu_access_pointer(net->bpf.links[type]))
> + return -EBUSY;
> +
> /* No need for update-side lock when net is going away. */
> attached = rcu_dereference_protected(net->bpf.progs[type],
> !check_net(net) ||
> @@ -112,14 +270,109 @@ int netns_bpf_prog_detach(const union bpf_attr
> *attr)
> return ret;
> }
> +static int __netns_bpf_link_attach(struct net *net, struct bpf_link
> *link,
> + enum netns_bpf_attach_type type)
> +{
> + int err;
> +
> + /* Allow attaching only one prog or link for now */
> + if (rcu_access_pointer(net->bpf.links[type]))
> + return -E2BIG;
> + /* Links are not compatible with attaching prog directly */
> + if (rcu_access_pointer(net->bpf.progs[type]))
> + return -EBUSY;
> +
> + switch (type) {
> + case NETNS_BPF_FLOW_DISSECTOR:
> + err = flow_dissector_bpf_prog_attach(net, link->prog);
> + break;
> + default:
> + err = -EINVAL;
> + break;
> + }
> + if (!err)
> + rcu_assign_pointer(net->bpf.links[type], link);
> + return err;
> +}
> +
> +static int netns_bpf_link_attach(struct net *net, struct bpf_link *link,
> + enum netns_bpf_attach_type type)
> +{
> + int ret;
> +
> + mutex_lock(&netns_bpf_mutex);
> + ret = __netns_bpf_link_attach(net, link, type);
What's the point of separating __netns_bpf_link_attach out?
Does it make sense to replace those rcu_access_pointer's in
__netns_bpf_link_attach with rcu_deref_protected and put everything here?
This makes it a bit easier to reason about.
> + mutex_unlock(&netns_bpf_mutex);
> +
> + return ret;
> +}
> +
> +int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog
> *prog)
> +{
> + enum netns_bpf_attach_type netns_type;
> + struct bpf_link_primer link_primer;
> + struct bpf_netns_link *net_link;
> + enum bpf_attach_type type;
> + struct net *net;
> + int err;
> +
> + if (attr->link_create.flags)
> + return -EINVAL;
> +
> + type = attr->link_create.attach_type;
> + netns_type = to_netns_bpf_attach_type(type);
> + if (netns_type < 0)
> + return -EINVAL;
> +
> + net = get_net_ns_by_fd(attr->link_create.target_fd);
> + if (IS_ERR(net))
> + return PTR_ERR(net);
> +
> + net_link = kzalloc(sizeof(*net_link), GFP_USER);
> + if (!net_link) {
> + err = -ENOMEM;
> + goto out_put_net;
> + }
> + bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS,
> + &bpf_netns_link_ops, prog);
> + rcu_assign_pointer(net_link->net, net);
> + net_link->type = type;
> + net_link->netns_type = netns_type;
> +
> + err = bpf_link_prime(&net_link->link, &link_primer);
> + if (err) {
> + kfree(net_link);
> + goto out_put_net;
> + }
> +
> + err = netns_bpf_link_attach(net, &net_link->link, netns_type);
> + if (err) {
> + bpf_link_cleanup(&link_primer);
> + goto out_put_net;
> + }
> +
> + err = bpf_link_settle(&link_primer);
> +out_put_net:
> + /* To auto-detach the link from netns when it is getting
> + * destroyed, we can't hold a ref to it. Instead, we rely on
> + * RCU when accessing link->net pointer.
> + */
> + put_net(net);
> + return err;
> +}
> +
> static void __net_exit netns_bpf_pernet_pre_exit(struct net *net)
> {
> enum netns_bpf_attach_type type;
> + rcu_read_lock();
> for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) {
> - if (rcu_access_pointer(net->bpf.progs[type]))
> + if (rcu_access_pointer(net->bpf.links[type]))
> + bpf_netns_link_auto_detach(net, type);
> + else if (rcu_access_pointer(net->bpf.progs[type]))
> __netns_bpf_prog_detach(net, type);
> }
> + rcu_read_unlock();
> }
> static struct pernet_operations netns_bpf_pernet_ops __net_initdata = {
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 93f329a6736d..47a7b426d75c 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -3874,6 +3874,9 @@ static int link_create(union bpf_attr *attr)
> case BPF_PROG_TYPE_TRACING:
> ret = tracing_bpf_link_attach(attr, prog);
> break;
> + case BPF_PROG_TYPE_FLOW_DISSECTOR:
> + ret = netns_bpf_link_create(attr, prog);
> + break;
> default:
> ret = -EINVAL;
> }
> diff --git a/net/core/filter.c b/net/core/filter.c
> index a6fc23447f12..b77391a1adb1 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -9081,6 +9081,7 @@ const struct bpf_verifier_ops
> sk_reuseport_verifier_ops = {
> const struct bpf_prog_ops sk_reuseport_prog_ops = {
> };
> +
> #endif /* CONFIG_INET */
> DEFINE_BPF_DISPATCHER(xdp)
> diff --git a/tools/include/uapi/linux/bpf.h
> b/tools/include/uapi/linux/bpf.h
> index 54b93f8b49b8..05469d3c5c82 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -235,6 +235,7 @@ enum bpf_link_type {
> BPF_LINK_TYPE_TRACING = 2,
> BPF_LINK_TYPE_CGROUP = 3,
> BPF_LINK_TYPE_ITER = 4,
> + BPF_LINK_TYPE_NETNS = 5,
> MAX_BPF_LINK_TYPE,
> };
> @@ -3753,6 +3754,10 @@ struct bpf_link_info {
> __u64 cgroup_id;
> __u32 attach_type;
> } cgroup;
> + struct {
> + __u32 netns_ino;
> + __u32 attach_type;
> + } netns;
> };
> } __attribute__((aligned(8)));
> --
> 2.25.4
Powered by blists - more mailing lists