[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20190813171610.GH2820@mini-arch>
Date: Tue, 13 Aug 2019 10:16:10 -0700
From: Stanislav Fomichev <sdf@...ichev.me>
To: Yonghong Song <yhs@...com>
Cc: Stanislav Fomichev <sdf@...gle.com>,
"netdev@...r.kernel.org" <netdev@...r.kernel.org>,
"bpf@...r.kernel.org" <bpf@...r.kernel.org>,
"davem@...emloft.net" <davem@...emloft.net>,
"ast@...nel.org" <ast@...nel.org>,
"daniel@...earbox.net" <daniel@...earbox.net>,
Martin Lau <kafai@...com>
Subject: Re: [PATCH bpf-next v3 2/4] bpf: support cloning sk storage on
accept()
On 08/13, Yonghong Song wrote:
>
>
> On 8/13/19 9:26 AM, Stanislav Fomichev wrote:
> > Add new helper bpf_sk_storage_clone which optionally clones sk storage
> > and call it from sk_clone_lock.
> >
> > Cc: Martin KaFai Lau <kafai@...com>
> > Cc: Yonghong Song <yhs@...com>
> > Acked-by: Yonghong Song <yhs@...com>
> > Signed-off-by: Stanislav Fomichev <sdf@...gle.com>
> > ---
> > include/net/bpf_sk_storage.h | 10 ++++
> > include/uapi/linux/bpf.h | 3 +
> > net/core/bpf_sk_storage.c | 103 ++++++++++++++++++++++++++++++++++-
> > net/core/sock.c | 9 ++-
> > 4 files changed, 119 insertions(+), 6 deletions(-)
> >
> > diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
> > index b9dcb02e756b..8e4f831d2e52 100644
> > --- a/include/net/bpf_sk_storage.h
> > +++ b/include/net/bpf_sk_storage.h
> > @@ -10,4 +10,14 @@ void bpf_sk_storage_free(struct sock *sk);
> > extern const struct bpf_func_proto bpf_sk_storage_get_proto;
> > extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
> >
> > +#ifdef CONFIG_BPF_SYSCALL
> > +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
> > +#else
> > +static inline int bpf_sk_storage_clone(const struct sock *sk,
> > + struct sock *newsk)
> > +{
> > + return 0;
> > +}
> > +#endif
> > +
> > #endif /* _BPF_SK_STORAGE_H */
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 4393bd4b2419..0ef594ac3899 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -337,6 +337,9 @@ enum bpf_attach_type {
> > #define BPF_F_RDONLY_PROG (1U << 7)
> > #define BPF_F_WRONLY_PROG (1U << 8)
> >
> > +/* Clone map from listener for newly accepted socket */
> > +#define BPF_F_CLONE (1U << 9)
> > +
> > /* flags for BPF_PROG_QUERY */
> > #define BPF_F_QUERY_EFFECTIVE (1U << 0)
> >
> > diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
> > index 94c7f77ecb6b..1bc7de7e18ba 100644
> > --- a/net/core/bpf_sk_storage.c
> > +++ b/net/core/bpf_sk_storage.c
> > @@ -12,6 +12,9 @@
> >
> > static atomic_t cache_idx;
> >
> > +#define SK_STORAGE_CREATE_FLAG_MASK \
> > + (BPF_F_NO_PREALLOC | BPF_F_CLONE)
> > +
> > struct bucket {
> > struct hlist_head list;
> > raw_spinlock_t lock;
> > @@ -209,7 +212,6 @@ static void selem_unlink_sk(struct bpf_sk_storage_elem *selem)
> > kfree_rcu(sk_storage, rcu);
> > }
> >
> > -/* sk_storage->lock must be held and sk_storage->list cannot be empty */
> > static void __selem_link_sk(struct bpf_sk_storage *sk_storage,
> > struct bpf_sk_storage_elem *selem)
> > {
> > @@ -509,7 +511,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
> > return 0;
> > }
> >
> > -/* Called by __sk_destruct() */
> > +/* Called by __sk_destruct() & bpf_sk_storage_clone() */
> > void bpf_sk_storage_free(struct sock *sk)
> > {
> > struct bpf_sk_storage_elem *selem;
> > @@ -557,6 +559,11 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
> >
> > smap = (struct bpf_sk_storage_map *)map;
> >
> > + /* Note that this map might be concurrently cloned from
> > + * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
> > + * RCU read section to finish before proceeding. New RCU
> > + * read sections should be prevented via bpf_map_inc_not_zero.
> > + */
> > synchronize_rcu();
> >
> > /* bpf prog and the userspace can no longer access this map
> > @@ -601,7 +608,9 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
> >
> > static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
> > {
> > - if (attr->map_flags != BPF_F_NO_PREALLOC || attr->max_entries ||
> > + if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK ||
> > + !(attr->map_flags & BPF_F_NO_PREALLOC) ||
> > + attr->max_entries ||
> > attr->key_size != sizeof(int) || !attr->value_size ||
> > /* Enforce BTF for userspace sk dumping */
> > !attr->btf_key_type_id || !attr->btf_value_type_id)
> > @@ -739,6 +748,94 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
> > return err;
> > }
> >
> > +static struct bpf_sk_storage_elem *
> > +bpf_sk_storage_clone_elem(struct sock *newsk,
> > + struct bpf_sk_storage_map *smap,
> > + struct bpf_sk_storage_elem *selem)
> > +{
> > + struct bpf_sk_storage_elem *copy_selem;
> > +
> > + copy_selem = selem_alloc(smap, newsk, NULL, true);
> > + if (!copy_selem)
> > + return NULL;
> > +
> > + if (map_value_has_spin_lock(&smap->map))
> > + copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
> > + SDATA(selem)->data, true);
> > + else
> > + copy_map_value(&smap->map, SDATA(copy_selem)->data,
> > + SDATA(selem)->data);
> > +
> > + return copy_selem;
> > +}
> > +
> > +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
> > +{
> > + struct bpf_sk_storage *new_sk_storage = NULL;
> > + struct bpf_sk_storage *sk_storage;
> > + struct bpf_sk_storage_elem *selem;
> > + int ret;
> > +
> > + RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
> > +
> > + rcu_read_lock();
> > + sk_storage = rcu_dereference(sk->sk_bpf_storage);
> > +
> > + if (!sk_storage || hlist_empty(&sk_storage->list))
> > + goto out;
> > +
> > + hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
> > + struct bpf_sk_storage_elem *copy_selem;
> > + struct bpf_sk_storage_map *smap;
> > + struct bpf_map *map;
> > +
> > + smap = rcu_dereference(SDATA(selem)->smap);
> > + if (!(smap->map.map_flags & BPF_F_CLONE))
> > + continue;
> > +
> > + map = bpf_map_inc_not_zero(&smap->map, false);
> > + if (IS_ERR(map))
> > + continue;
> > +
> > + copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
> > + if (!copy_selem) {
> > + ret = -ENOMEM;
> > + bpf_map_put(map);
> > + goto err;
> > + }
> > +
> > + if (new_sk_storage) {
> > + selem_link_map(smap, copy_selem);
> > + __selem_link_sk(new_sk_storage, copy_selem);
> > + } else {
> > + ret = sk_storage_alloc(newsk, smap, copy_selem);
> > + if (ret) {
> > + kfree(copy_selem);
> > + atomic_sub(smap->elem_size,
> > + &newsk->sk_omem_alloc);
> > + bpf_map_put(map);
> > + goto err;
> > + }
> > +
> > + new_sk_storage = rcu_dereference(copy_selem->sk_storage);
> > + }
> > + bpf_map_put(map);
> > + }
> > +
> > +out:
> > + rcu_read_unlock();
> > + return 0;
> > +
> > +err:
> > + rcu_read_unlock();
> > +
> > + /* Don't free anything explicitly here, caller is responsible to
> > + * call bpf_sk_storage_free in case of an error.
> > + */
> > +
> > + return ret;
>
> A nit.
> If you set ret = 0 initially, you do not need the above two
> rcu_read_unlock(). One "return ret" should be enough.
> The comment can be changed to
> /* In case of an error, ... */
I thought about it, but decided to keep them separate because of
this comment. OTOH, adding "In case of an error," might help with
that. Can do another respin once Martin gets to review this
version (don't want to spam).
> > +}
> > +
> > BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> > void *, value, u64, flags)
> > {
> > diff --git a/net/core/sock.c b/net/core/sock.c
> > index d57b0cc995a0..f5e801a9cea4 100644
> > --- a/net/core/sock.c
> > +++ b/net/core/sock.c
> > @@ -1851,9 +1851,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
> > goto out;
> > }
> > RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
> > -#ifdef CONFIG_BPF_SYSCALL
> > - RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
> > -#endif
> > +
> > + if (bpf_sk_storage_clone(sk, newsk)) {
> > + sk_free_unlock_clone(newsk);
> > + newsk = NULL;
> > + goto out;
> > + }
> >
> > newsk->sk_err = 0;
> > newsk->sk_err_soft = 0;
> >
Powered by blists - more mailing lists