[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20200331000513.GA54465@rdna-mbp.dhcp.thefacebook.com>
Date: Mon, 30 Mar 2020 17:05:13 -0700
From: Andrey Ignatov <rdna@...com>
To: Andrii Nakryiko <andriin@...com>
CC: <bpf@...r.kernel.org>, <netdev@...r.kernel.org>, <ast@...com>,
<daniel@...earbox.net>, <andrii.nakryiko@...il.com>,
<kernel-team@...com>
Subject: Re: [PATCH v3 bpf-next 1/4] bpf: implement bpf_link-based cgroup BPF
program attachment
Andrii Nakryiko <andriin@...com> [Sun, 2020-03-29 20:00 -0700]:
> Implement new sub-command to attach cgroup BPF programs and return FD-based
> bpf_link back on success. bpf_link, once attached to cgroup, cannot be
> replaced, except by owner having its FD. Cgroup bpf_link supports only
> BPF_F_ALLOW_MULTI semantics. Both link-based and prog-based BPF_F_ALLOW_MULTI
> attachments can be freely intermixed.
>
> To prevent bpf_cgroup_link from keeping cgroup alive past the point when no
> BPF program can be executed, implement auto-detachment of link. When
> cgroup_bpf_release() is called, all attached bpf_links are forced to release
> cgroup refcounts, but they leave bpf_link otherwise active and allocated, as
> well as still owning underlying bpf_prog. This is because user-space might
> still have FDs open and active, so bpf_link as a user-referenced object can't
> be freed yet. Once last active FD is closed, bpf_link will be freed and
> underlying bpf_prog refcount will be dropped. But cgroup refcount won't be
> touched, because cgroup is released already.
>
> The inherent race between bpf_cgroup_link release (from closing last FD) and
> cgroup_bpf_release() is resolved by both operations taking cgroup_mutex. So
> the only additional check required is when bpf_cgroup_link attempts to detach
> itself from cgroup. At that time we need to check whether there is still
> cgroup associated with that link. And if not, exit with success, because
> bpf_cgroup_link was already successfully detached.
>
> Acked-by: Roman Gushchin <guro@...com>
> Signed-off-by: Andrii Nakryiko <andriin@...com>
> ---
> include/linux/bpf-cgroup.h | 29 ++-
> include/linux/bpf.h | 10 +-
> include/uapi/linux/bpf.h | 10 +-
> kernel/bpf/cgroup.c | 315 +++++++++++++++++++++++++--------
> kernel/bpf/syscall.c | 61 ++++++-
> kernel/cgroup/cgroup.c | 14 +-
> tools/include/uapi/linux/bpf.h | 10 +-
> 7 files changed, 351 insertions(+), 98 deletions(-)
>
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index a7cd5c7a2509..d2d969669564 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -51,9 +51,18 @@ struct bpf_cgroup_storage {
> struct rcu_head rcu;
> };
>
> +struct bpf_cgroup_link {
> + struct bpf_link link;
> + struct cgroup *cgroup;
> + enum bpf_attach_type type;
> +};
> +
> +extern const struct bpf_link_ops bpf_cgroup_link_lops;
> +
> struct bpf_prog_list {
> struct list_head node;
> struct bpf_prog *prog;
> + struct bpf_cgroup_link *link;
> struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
> };
>
> @@ -84,20 +93,23 @@ struct cgroup_bpf {
> int cgroup_bpf_inherit(struct cgroup *cgrp);
> void cgroup_bpf_offline(struct cgroup *cgrp);
>
> -int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
> - struct bpf_prog *replace_prog,
> +int __cgroup_bpf_attach(struct cgroup *cgrp,
> + struct bpf_prog *prog, struct bpf_prog *replace_prog,
> + struct bpf_cgroup_link *link,
> enum bpf_attach_type type, u32 flags);
> int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
> + struct bpf_cgroup_link *link,
> enum bpf_attach_type type);
> int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
> union bpf_attr __user *uattr);
>
> /* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */
> -int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
> - struct bpf_prog *replace_prog, enum bpf_attach_type type,
> +int cgroup_bpf_attach(struct cgroup *cgrp,
> + struct bpf_prog *prog, struct bpf_prog *replace_prog,
> + struct bpf_cgroup_link *link, enum bpf_attach_type type,
> u32 flags);
> int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
> - enum bpf_attach_type type, u32 flags);
> + enum bpf_attach_type type);
> int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
> union bpf_attr __user *uattr);
>
> @@ -332,6 +344,7 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr,
> enum bpf_prog_type ptype, struct bpf_prog *prog);
> int cgroup_bpf_prog_detach(const union bpf_attr *attr,
> enum bpf_prog_type ptype);
> +int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
> int cgroup_bpf_prog_query(const union bpf_attr *attr,
> union bpf_attr __user *uattr);
> #else
> @@ -354,6 +367,12 @@ static inline int cgroup_bpf_prog_detach(const union bpf_attr *attr,
> return -EINVAL;
> }
>
> +static inline int cgroup_bpf_link_attach(const union bpf_attr *attr,
> + struct bpf_prog *prog)
> +{
> + return -EINVAL;
> +}
> +
> static inline int cgroup_bpf_prog_query(const union bpf_attr *attr,
> union bpf_attr __user *uattr)
> {
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 3bde59a8453b..56254d880293 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1082,15 +1082,23 @@ extern int sysctl_unprivileged_bpf_disabled;
> int bpf_map_new_fd(struct bpf_map *map, int flags);
> int bpf_prog_new_fd(struct bpf_prog *prog);
>
> -struct bpf_link;
> +struct bpf_link {
> + atomic64_t refcnt;
> + const struct bpf_link_ops *ops;
> + struct bpf_prog *prog;
> + struct work_struct work;
> +};
>
> struct bpf_link_ops {
> void (*release)(struct bpf_link *link);
> void (*dealloc)(struct bpf_link *link);
> +
> };
>
> void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops,
> struct bpf_prog *prog);
> +void bpf_link_cleanup(struct bpf_link *link, struct file *link_file,
> + int link_fd);
> void bpf_link_inc(struct bpf_link *link);
> void bpf_link_put(struct bpf_link *link);
> int bpf_link_new_fd(struct bpf_link *link);
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index f1fbc36f58d3..8b3f1c098ac0 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -111,6 +111,7 @@ enum bpf_cmd {
> BPF_MAP_LOOKUP_AND_DELETE_BATCH,
> BPF_MAP_UPDATE_BATCH,
> BPF_MAP_DELETE_BATCH,
> + BPF_LINK_CREATE,
> };
>
> enum bpf_map_type {
> @@ -541,7 +542,7 @@ union bpf_attr {
> __u32 prog_cnt;
> } query;
>
> - struct {
> + struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */
> __u64 name;
> __u32 prog_fd;
> } raw_tracepoint;
> @@ -569,6 +570,13 @@ union bpf_attr {
> __u64 probe_offset; /* output: probe_offset */
> __u64 probe_addr; /* output: probe_addr */
> } task_fd_query;
> +
> + struct { /* struct used by BPF_LINK_CREATE command */
> + __u32 prog_fd; /* eBPF program to attach */
> + __u32 target_fd; /* object to attach to */
> + __u32 attach_type; /* attach type */
> + __u32 flags; /* extra flags */
> + } link_create;
> } __attribute__((aligned(8)));
>
> /* The description below is an attempt at providing documentation to eBPF
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index 9c8472823a7f..c24029937431 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -80,6 +80,17 @@ static void bpf_cgroup_storages_unlink(struct bpf_cgroup_storage *storages[])
> bpf_cgroup_storage_unlink(storages[stype]);
> }
>
> +/* Called when bpf_cgroup_link is auto-detached from dying cgroup.
> + * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It
> + * doesn't free link memory, which will eventually be done by bpf_link's
> + * release() callback, when its last FD is closed.
> + */
> +static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
> +{
> + cgroup_put(link->cgroup);
> + link->cgroup = NULL;
> +}
> +
> /**
> * cgroup_bpf_release() - put references of all bpf programs and
> * release all cgroup bpf data
> @@ -100,7 +111,10 @@ static void cgroup_bpf_release(struct work_struct *work)
>
> list_for_each_entry_safe(pl, tmp, progs, node) {
> list_del(&pl->node);
> - bpf_prog_put(pl->prog);
> + if (pl->prog)
> + bpf_prog_put(pl->prog);
> + if (pl->link)
> + bpf_cgroup_link_auto_detach(pl->link);
> bpf_cgroup_storages_unlink(pl->storage);
> bpf_cgroup_storages_free(pl->storage);
> kfree(pl);
> @@ -134,6 +148,18 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref)
> queue_work(system_wq, &cgrp->bpf.release_work);
> }
>
> +/* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through
> + * link or direct prog.
> + */
> +static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
> +{
> + if (pl->prog)
> + return pl->prog;
> + if (pl->link)
> + return pl->link->link.prog;
> + return NULL;
> +}
> +
> /* count number of elements in the list.
> * it's slow but the list cannot be long
> */
> @@ -143,7 +169,7 @@ static u32 prog_list_length(struct list_head *head)
> u32 cnt = 0;
>
> list_for_each_entry(pl, head, node) {
> - if (!pl->prog)
> + if (!prog_list_prog(pl))
> continue;
> cnt++;
> }
> @@ -212,11 +238,11 @@ static int compute_effective_progs(struct cgroup *cgrp,
> continue;
>
> list_for_each_entry(pl, &p->bpf.progs[type], node) {
> - if (!pl->prog)
> + if (!prog_list_prog(pl))
> continue;
>
> item = &progs->items[cnt];
> - item->prog = pl->prog;
> + item->prog = prog_list_prog(pl);
> bpf_cgroup_storages_assign(item->cgroup_storage,
> pl->storage);
> cnt++;
> @@ -333,19 +359,60 @@ static int update_effective_progs(struct cgroup *cgrp,
>
> #define BPF_CGROUP_MAX_PROGS 64
>
> +static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
> + struct bpf_prog *prog,
> + struct bpf_cgroup_link *link,
> + struct bpf_prog *replace_prog,
> + bool allow_multi)
> +{
> + struct bpf_prog_list *pl;
> +
> + /* single-attach case */
> + if (!allow_multi) {
> + if (list_empty(progs))
> + return NULL;
> + return list_first_entry(progs, typeof(*pl), node);
> + }
> +
> + list_for_each_entry(pl, progs, node) {
> + if (prog && pl->prog == prog)
> + /* disallow attaching the same prog twice */
> + return ERR_PTR(-EINVAL);
> + if (link && pl->link == link)
> + /* disallow attaching the same link twice */
> + return ERR_PTR(-EINVAL);
> + }
> +
> + /* direct prog multi-attach w/ replacement case */
> + if (replace_prog) {
> + list_for_each_entry(pl, progs, node) {
> + if (pl->prog == replace_prog)
> + /* a match found */
> + return pl;
> + }
> + /* prog to replace not found for cgroup */
> + return ERR_PTR(-ENOENT);
> + }
> +
> + return NULL;
> +}
> +
> /**
> - * __cgroup_bpf_attach() - Attach the program to a cgroup, and
> + * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and
> * propagate the change to descendants
> * @cgrp: The cgroup which descendants to traverse
> * @prog: A program to attach
> + * @link: A link to attach
> * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
> * @type: Type of attach operation
> * @flags: Option flags
> *
> + * Exactly one of @prog or @link can be non-null.
> * Must be called with cgroup_mutex held.
> */
> -int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
> - struct bpf_prog *replace_prog,
> +int __cgroup_bpf_attach(struct cgroup *cgrp,
> + struct bpf_prog *prog, struct bpf_prog *replace_prog,
> + struct bpf_cgroup_link *link,
> enum bpf_attach_type type, u32 flags)
> {
> u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
> @@ -353,13 +420,19 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
> struct bpf_prog *old_prog = NULL;
> struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE],
> *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL};
> - struct bpf_prog_list *pl, *replace_pl = NULL;
> + struct bpf_prog_list *pl;
> int err;
>
> if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
> ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
> /* invalid combination */
> return -EINVAL;
> + if (link && (prog || replace_prog))
> + /* only either link or prog/replace_prog can be specified */
> + return -EINVAL;
> + if (!!replace_prog != !!(flags & BPF_F_REPLACE))
> + /* replace_prog implies BPF_F_REPLACE, and vice versa */
> + return -EINVAL;
>
> if (!hierarchy_allows_attach(cgrp, type))
> return -EPERM;
> @@ -374,26 +447,15 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
> if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
> return -E2BIG;
>
> - if (flags & BPF_F_ALLOW_MULTI) {
> - list_for_each_entry(pl, progs, node) {
> - if (pl->prog == prog)
> - /* disallow attaching the same prog twice */
> - return -EINVAL;
> - if (pl->prog == replace_prog)
> - replace_pl = pl;
> - }
> - if ((flags & BPF_F_REPLACE) && !replace_pl)
> - /* prog to replace not found for cgroup */
> - return -ENOENT;
> - } else if (!list_empty(progs)) {
> - replace_pl = list_first_entry(progs, typeof(*pl), node);
> - }
> + pl = find_attach_entry(progs, prog, link, replace_prog,
> + flags & BPF_F_ALLOW_MULTI);
> + if (IS_ERR(pl))
> + return PTR_ERR(pl);
>
> - if (bpf_cgroup_storages_alloc(storage, prog))
> + if (bpf_cgroup_storages_alloc(storage, prog ? : link->link.prog))
> return -ENOMEM;
>
> - if (replace_pl) {
> - pl = replace_pl;
> + if (pl) {
> old_prog = pl->prog;
> bpf_cgroup_storages_unlink(pl->storage);
> bpf_cgroup_storages_assign(old_storage, pl->storage);
> @@ -407,6 +469,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
> }
>
> pl->prog = prog;
> + pl->link = link;
> bpf_cgroup_storages_assign(pl->storage, storage);
> cgrp->bpf.flags[type] = saved_flags;
>
> @@ -414,80 +477,93 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
> if (err)
> goto cleanup;
>
> - static_branch_inc(&cgroup_bpf_enabled_key);
> bpf_cgroup_storages_free(old_storage);
> - if (old_prog) {
> + if (old_prog)
> bpf_prog_put(old_prog);
> - static_branch_dec(&cgroup_bpf_enabled_key);
> - }
> - bpf_cgroup_storages_link(storage, cgrp, type);
> + else
> + static_branch_inc(&cgroup_bpf_enabled_key);
> + bpf_cgroup_storages_link(pl->storage, cgrp, type);
> return 0;
>
> cleanup:
> - /* and cleanup the prog list */
> - pl->prog = old_prog;
> + if (old_prog) {
> + pl->prog = old_prog;
> + pl->link = NULL;
> + }
> bpf_cgroup_storages_free(pl->storage);
> bpf_cgroup_storages_assign(pl->storage, old_storage);
> bpf_cgroup_storages_link(pl->storage, cgrp, type);
> - if (!replace_pl) {
> + if (!old_prog) {
> list_del(&pl->node);
> kfree(pl);
> }
> return err;
> }
>
> +static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
> + struct bpf_prog *prog,
> + struct bpf_cgroup_link *link,
> + bool allow_multi)
> +{
> + struct bpf_prog_list *pl;
> +
> + if (!allow_multi) {
> + if (list_empty(progs))
> + /* report error when trying to detach and nothing is attached */
> + return ERR_PTR(-ENOENT);
> +
> + /* to maintain backward compatibility NONE and OVERRIDE cgroups
> + * allow detaching with invalid FD (prog==NULL) in legacy mode
> + */
> + return list_first_entry(progs, typeof(*pl), node);
> + }
> +
> + if (!prog && !link)
> + /* to detach MULTI prog the user has to specify valid FD
> + * of the program or link to be detached
> + */
> + return ERR_PTR(-EINVAL);
> +
> + /* find the prog or link and detach it */
> + list_for_each_entry(pl, progs, node) {
> + if (pl->prog == prog && pl->link == link)
> + return pl;
> + }
> + return ERR_PTR(-ENOENT);
> +}
> +
> /**
> - * __cgroup_bpf_detach() - Detach the program from a cgroup, and
> + * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and
> * propagate the change to descendants
> * @cgrp: The cgroup which descendants to traverse
> * @prog: A program to detach or NULL
> + * @link: A link to detach or NULL
> * @type: Type of detach operation
> *
> + * At most one of @prog or @link can be non-NULL.
> * Must be called with cgroup_mutex held.
> */
> int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
> - enum bpf_attach_type type)
> + struct bpf_cgroup_link *link, enum bpf_attach_type type)
> {
> struct list_head *progs = &cgrp->bpf.progs[type];
> u32 flags = cgrp->bpf.flags[type];
> - struct bpf_prog *old_prog = NULL;
> struct bpf_prog_list *pl;
> + struct bpf_prog *old_prog;
> int err;
>
> - if (flags & BPF_F_ALLOW_MULTI) {
> - if (!prog)
> - /* to detach MULTI prog the user has to specify valid FD
> - * of the program to be detached
> - */
> - return -EINVAL;
> - } else {
> - if (list_empty(progs))
> - /* report error when trying to detach and nothing is attached */
> - return -ENOENT;
> - }
> + if (prog && link)
> + /* only one of prog or link can be specified */
> + return -EINVAL;
>
> - if (flags & BPF_F_ALLOW_MULTI) {
> - /* find the prog and detach it */
> - list_for_each_entry(pl, progs, node) {
> - if (pl->prog != prog)
> - continue;
> - old_prog = prog;
> - /* mark it deleted, so it's ignored while
> - * recomputing effective
> - */
> - pl->prog = NULL;
> - break;
> - }
> - if (!old_prog)
> - return -ENOENT;
> - } else {
> - /* to maintain backward compatibility NONE and OVERRIDE cgroups
> - * allow detaching with invalid FD (prog==NULL)
> - */
> - pl = list_first_entry(progs, typeof(*pl), node);
> - old_prog = pl->prog;
> - pl->prog = NULL;
> - }
> + pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI);
> + if (IS_ERR(pl))
> + return PTR_ERR(pl);
> +
> + /* mark it deleted, so it's ignored while recomputing effective */
> + old_prog = pl->prog;
> + pl->prog = NULL;
> + pl->link = NULL;
>
> err = update_effective_progs(cgrp, type);
> if (err)
> @@ -501,14 +577,15 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
> if (list_empty(progs))
> /* last program was detached, reset flags to zero */
> cgrp->bpf.flags[type] = 0;
> -
> - bpf_prog_put(old_prog);
> + if (old_prog)
> + bpf_prog_put(old_prog);
> static_branch_dec(&cgroup_bpf_enabled_key);
> return 0;
>
> cleanup:
> - /* and restore back old_prog */
> + /* restore back prog or link */
> pl->prog = old_prog;
> + pl->link = link;
> return err;
> }
>
> @@ -521,6 +598,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
> struct list_head *progs = &cgrp->bpf.progs[type];
> u32 flags = cgrp->bpf.flags[type];
> struct bpf_prog_array *effective;
> + struct bpf_prog *prog;
> int cnt, ret = 0, i;
>
> effective = rcu_dereference_protected(cgrp->bpf.effective[type],
> @@ -551,7 +629,8 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>
> i = 0;
> list_for_each_entry(pl, progs, node) {
> - id = pl->prog->aux->id;
> + prog = prog_list_prog(pl);
> + id = prog->aux->id;
> if (copy_to_user(prog_ids + i, &id, sizeof(id)))
> return -EFAULT;
> if (++i == cnt)
> @@ -581,8 +660,8 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr,
> }
> }
>
> - ret = cgroup_bpf_attach(cgrp, prog, replace_prog, attr->attach_type,
> - attr->attach_flags);
> + ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
> + attr->attach_type, attr->attach_flags);
>
> if (replace_prog)
> bpf_prog_put(replace_prog);
> @@ -604,7 +683,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
> if (IS_ERR(prog))
> prog = NULL;
>
> - ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
> + ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type);
> if (prog)
> bpf_prog_put(prog);
>
> @@ -612,6 +691,90 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
> return ret;
> }
>
> +static void bpf_cgroup_link_release(struct bpf_link *link)
> +{
> + struct bpf_cgroup_link *cg_link =
> + container_of(link, struct bpf_cgroup_link, link);
> +
> + /* link might have been auto-detached by dying cgroup already,
> + * in that case our work is done here
> + */
> + if (!cg_link->cgroup)
> + return;
> +
> + mutex_lock(&cgroup_mutex);
> +
> + /* re-check cgroup under lock again */
> + if (!cg_link->cgroup) {
> + mutex_unlock(&cgroup_mutex);
> + return;
> + }
> +
> + WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
> + cg_link->type));
> +
> + mutex_unlock(&cgroup_mutex);
> + cgroup_put(cg_link->cgroup);
> +}
> +
> +static void bpf_cgroup_link_dealloc(struct bpf_link *link)
> +{
> + struct bpf_cgroup_link *cg_link =
> + container_of(link, struct bpf_cgroup_link, link);
> +
> + kfree(cg_link);
> +}
> +
> +const struct bpf_link_ops bpf_cgroup_link_lops = {
> + .release = bpf_cgroup_link_release,
> + .dealloc = bpf_cgroup_link_dealloc,
> +};
> +
> +int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
> +{
> + struct bpf_cgroup_link *link;
> + struct file *link_file;
> + struct cgroup *cgrp;
> + int err, link_fd;
> +
> + if (attr->link_create.flags)
> + return -EINVAL;
> +
> + cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
> + if (IS_ERR(cgrp))
> + return PTR_ERR(cgrp);
> +
> + link = kzalloc(sizeof(*link), GFP_USER);
> + if (!link) {
> + err = -ENOMEM;
> + goto out_put_cgroup;
> + }
> + bpf_link_init(&link->link, &bpf_cgroup_link_lops, prog);
> + link->cgroup = cgrp;
> + link->type = attr->link_create.attach_type;
> +
> + link_file = bpf_link_new_file(&link->link, &link_fd);
> + if (IS_ERR(link_file)) {
> + kfree(link);
> + err = PTR_ERR(link_file);
> + goto out_put_cgroup;
> + }
> +
> + err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type,
> + BPF_F_ALLOW_MULTI);
> + if (err) {
> + bpf_link_cleanup(&link->link, link_file, link_fd);
> + goto out_put_cgroup;
> + }
> +
> + fd_install(link_fd, link_file);
> + return link_fd;
> +
> +out_put_cgroup:
> + cgroup_put(cgrp);
> + return err;
> +}
> +
> int cgroup_bpf_prog_query(const union bpf_attr *attr,
> union bpf_attr __user *uattr)
> {
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index a616b63f23b4..05412b83ed6c 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -2175,13 +2175,6 @@ static int bpf_obj_get(const union bpf_attr *attr)
> attr->file_flags);
> }
>
> -struct bpf_link {
> - atomic64_t refcnt;
> - const struct bpf_link_ops *ops;
> - struct bpf_prog *prog;
> - struct work_struct work;
> -};
> -
> void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops,
> struct bpf_prog *prog)
> {
> @@ -2195,8 +2188,8 @@ void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops,
> * anon_inode's release() call. This helper manages marking bpf_link as
> * defunct, releases anon_inode file and puts reserved FD.
> */
> -static void bpf_link_cleanup(struct bpf_link *link, struct file *link_file,
> - int link_fd)
> +void bpf_link_cleanup(struct bpf_link *link, struct file *link_file,
> + int link_fd)
> {
> link->prog = NULL;
> fput(link_file);
> @@ -2266,6 +2259,10 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
> link_type = "raw_tracepoint";
> else if (link->ops == &bpf_tracing_link_lops)
> link_type = "tracing";
> +#ifdef CONFIG_CGROUP_BPF
> + else if (link->ops == &bpf_cgroup_link_lops)
> + link_type = "cgroup";
> +#endif
> else
> link_type = "unknown";
>
> @@ -3553,6 +3550,49 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
> return err;
> }
>
> +#define BPF_LINK_CREATE_LAST_FIELD link_create.flags
> +static int link_create(union bpf_attr *attr)
> +{
From what I see, this function does not check any capability, whereas the
existing bpf_prog_attach() checks for CAP_NET_ADMIN.
This is a pretty important difference but I don't see it clarified in the
commit message or discussed (or I missed it?).
Having a way to attach cgroup bpf prog by non-priv users is actually
helpful in some use-cases, e.g. systemd required patching in the past to
make it work with user (non-priv) sessions, see [0].
But in other cases it's also useful to limit the ability to attach
programs to a cgroup while using bpf_link so that only the thing that
controls cgroup setup can attach but not any non-priv process running in
that cgroup. How is this use-case covered in BPF_LINK_CREATE?
[0] https://github.com/systemd/systemd/pull/12745
> + enum bpf_prog_type ptype;
> + struct bpf_prog *prog;
> + int ret;
> +
> + if (CHECK_ATTR(BPF_LINK_CREATE))
> + return -EINVAL;
> +
> + ptype = attach_type_to_prog_type(attr->link_create.attach_type);
> + if (ptype == BPF_PROG_TYPE_UNSPEC)
> + return -EINVAL;
> +
> + prog = bpf_prog_get_type(attr->link_create.prog_fd, ptype);
> + if (IS_ERR(prog))
> + return PTR_ERR(prog);
> +
> + ret = bpf_prog_attach_check_attach_type(prog,
> + attr->link_create.attach_type);
> + if (ret)
> + goto err_out;
> +
> + switch (ptype) {
> + case BPF_PROG_TYPE_CGROUP_SKB:
> + case BPF_PROG_TYPE_CGROUP_SOCK:
> + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
> + case BPF_PROG_TYPE_SOCK_OPS:
> + case BPF_PROG_TYPE_CGROUP_DEVICE:
> + case BPF_PROG_TYPE_CGROUP_SYSCTL:
> + case BPF_PROG_TYPE_CGROUP_SOCKOPT:
> + ret = cgroup_bpf_link_attach(attr, prog);
> + break;
> + default:
> + ret = -EINVAL;
> + }
> +
> +err_out:
> + if (ret < 0)
> + bpf_prog_put(prog);
> + return ret;
> +}
> +
> SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
> {
> union bpf_attr attr = {};
> @@ -3663,6 +3703,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
> case BPF_MAP_DELETE_BATCH:
> err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH);
> break;
> + case BPF_LINK_CREATE:
> + err = link_create(&attr);
> + break;
> default:
> err = -EINVAL;
> break;
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 3dead0416b91..219624fba9ba 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -6303,27 +6303,31 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
> #endif /* CONFIG_SOCK_CGROUP_DATA */
>
> #ifdef CONFIG_CGROUP_BPF
> -int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
> - struct bpf_prog *replace_prog, enum bpf_attach_type type,
> +int cgroup_bpf_attach(struct cgroup *cgrp,
> + struct bpf_prog *prog, struct bpf_prog *replace_prog,
> + struct bpf_cgroup_link *link,
> + enum bpf_attach_type type,
> u32 flags)
> {
> int ret;
>
> mutex_lock(&cgroup_mutex);
> - ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, type, flags);
> + ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
> mutex_unlock(&cgroup_mutex);
> return ret;
> }
> +
> int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
> - enum bpf_attach_type type, u32 flags)
> + enum bpf_attach_type type)
> {
> int ret;
>
> mutex_lock(&cgroup_mutex);
> - ret = __cgroup_bpf_detach(cgrp, prog, type);
> + ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
> mutex_unlock(&cgroup_mutex);
> return ret;
> }
> +
> int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
> union bpf_attr __user *uattr)
> {
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index f1fbc36f58d3..8b3f1c098ac0 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -111,6 +111,7 @@ enum bpf_cmd {
> BPF_MAP_LOOKUP_AND_DELETE_BATCH,
> BPF_MAP_UPDATE_BATCH,
> BPF_MAP_DELETE_BATCH,
> + BPF_LINK_CREATE,
> };
>
> enum bpf_map_type {
> @@ -541,7 +542,7 @@ union bpf_attr {
> __u32 prog_cnt;
> } query;
>
> - struct {
> + struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */
> __u64 name;
> __u32 prog_fd;
> } raw_tracepoint;
> @@ -569,6 +570,13 @@ union bpf_attr {
> __u64 probe_offset; /* output: probe_offset */
> __u64 probe_addr; /* output: probe_addr */
> } task_fd_query;
> +
> + struct { /* struct used by BPF_LINK_CREATE command */
> + __u32 prog_fd; /* eBPF program to attach */
> + __u32 target_fd; /* object to attach to */
> + __u32 attach_type; /* attach type */
> + __u32 flags; /* extra flags */
> + } link_create;
> } __attribute__((aligned(8)));
>
> /* The description below is an attempt at providing documentation to eBPF
> --
> 2.17.1
>
--
Andrey Ignatov
Powered by blists - more mailing lists