[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAKgT0UeZRHWOW4gsQD7OGWAD99ObHEqSUkvEdZjDG-iUY3wpSg@mail.gmail.com>
Date: Fri, 23 Mar 2018 09:35:47 -0700
From: Alexander Duyck <alexander.duyck@...il.com>
To: Jesper Dangaard Brouer <brouer@...hat.com>
Cc: Netdev <netdev@...r.kernel.org>,
Björn Töpel <bjorn.topel@...el.com>,
"Karlsson, Magnus" <magnus.karlsson@...el.com>,
Eugenia Emantayev <eugenia@...lanox.com>,
Jason Wang <jasowang@...hat.com>,
John Fastabend <john.fastabend@...il.com>,
Eran Ben Elisha <eranbe@...lanox.com>,
Saeed Mahameed <saeedm@...lanox.com>,
Gal Pressman <galp@...lanox.com>,
Daniel Borkmann <borkmann@...earbox.net>,
Alexei Starovoitov <alexei.starovoitov@...il.com>,
Tariq Toukan <tariqt@...lanox.com>
Subject: Re: [bpf-next V5 PATCH 02/15] xdp: introduce xdp_return_frame API and
use in cpumap
On Fri, Mar 23, 2018 at 5:18 AM, Jesper Dangaard Brouer
<brouer@...hat.com> wrote:
> Introduce an xdp_return_frame API, and convert over cpumap as
> the first user, given it have queued XDP frame structure to leverage.
>
> V3: Cleanup and remove C99 style comments, pointed out by Alex Duyck.
>
> Signed-off-by: Jesper Dangaard Brouer <brouer@...hat.com>
> ---
> include/net/xdp.h | 28 ++++++++++++++++++++++++
> kernel/bpf/cpumap.c | 60 +++++++++++++++++++++++++++++++--------------------
> net/core/xdp.c | 18 +++++++++++++++
> 3 files changed, 82 insertions(+), 24 deletions(-)
>
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index b2362ddfa694..15b546325e31 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -33,16 +33,44 @@
> * also mandatory during RX-ring setup.
> */
>
> +enum mem_type {
> + MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */
> + MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */
> + MEM_TYPE_MAX,
> +};
> +
> +struct xdp_mem_info {
> + u32 type; /* enum mem_type, but known size type */
Do you really need to make this a full u32 value for something that
is likely never going to exceed a single-digit value?
> + /* u32 id; will be added later in this patchset */
Wouldn't it be better to just hold off and add it then instead of
adding it as a comment?
> +};
> +
> struct xdp_rxq_info {
> struct net_device *dev;
> u32 queue_index;
> u32 reg_state;
> + struct xdp_mem_info mem;
> } ____cacheline_aligned; /* perf critical, avoid false-sharing */
>
> +
> +static inline
> +void xdp_return_frame(void *data, struct xdp_mem_info *mem)
> +{
> + if (mem->type == MEM_TYPE_PAGE_SHARED)
> + page_frag_free(data);
> +
> + if (mem->type == MEM_TYPE_PAGE_ORDER0) {
> + struct page *page = virt_to_page(data); /* Assumes order0 page*/
> +
> + put_page(page);
> + }
Actually page_frag_free would probably work for either one. Also is it
safe to assume that the page is order 0? Are the only users of
compound pages that support XDP also only supporting the page fragment
setup?
Also you probably don't need put_page. It might be better to use
__free_page if you are certain the pages are coming from the Rx path
of drivers and don't have any special destructors associated with
them.
> +}
> +
> int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
> struct net_device *dev, u32 queue_index);
> void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
> void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq);
> bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq);
> +int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
> + enum mem_type type, void *allocator);
>
> #endif /* __LINUX_NET_XDP_H__ */
> diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
> index a4bb0b34375a..3e4bbcbe3e86 100644
> --- a/kernel/bpf/cpumap.c
> +++ b/kernel/bpf/cpumap.c
> @@ -19,6 +19,7 @@
> #include <linux/bpf.h>
> #include <linux/filter.h>
> #include <linux/ptr_ring.h>
> +#include <net/xdp.h>
>
> #include <linux/sched.h>
> #include <linux/workqueue.h>
> @@ -137,27 +138,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
> return ERR_PTR(err);
> }
>
> -static void __cpu_map_queue_destructor(void *ptr)
> -{
> - /* The tear-down procedure should have made sure that queue is
> - * empty. See __cpu_map_entry_replace() and work-queue
> - * invoked cpu_map_kthread_stop(). Catch any broken behaviour
> - * gracefully and warn once.
> - */
> - if (WARN_ON_ONCE(ptr))
> - page_frag_free(ptr);
> -}
> -
> -static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
> -{
> - if (atomic_dec_and_test(&rcpu->refcnt)) {
> - /* The queue should be empty at this point */
> - ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor);
> - kfree(rcpu->queue);
> - kfree(rcpu);
> - }
> -}
> -
> static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
> {
> atomic_inc(&rcpu->refcnt);
> @@ -188,6 +168,10 @@ struct xdp_pkt {
> u16 len;
> u16 headroom;
> u16 metasize;
> + /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time,
> + * while mem info is valid on remote CPU.
> + */
> + struct xdp_mem_info mem;
> struct net_device *dev_rx;
> };
>
> @@ -213,6 +197,9 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
> xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
> xdp_pkt->metasize = metasize;
>
> + /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */
> + xdp_pkt->mem = xdp->rxq->mem;
> +
> return xdp_pkt;
> }
>
> @@ -265,6 +252,31 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
> return skb;
> }
>
> +static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
> +{
> + /* The tear-down procedure should have made sure that queue is
> + * empty. See __cpu_map_entry_replace() and work-queue
> + * invoked cpu_map_kthread_stop(). Catch any broken behaviour
> + * gracefully and warn once.
> + */
> + struct xdp_pkt *xdp_pkt;
> +
> + while ((xdp_pkt = ptr_ring_consume(ring)))
> + if (WARN_ON_ONCE(xdp_pkt))
> + xdp_return_frame(xdp_pkt, &xdp_pkt->mem);
> +}
> +
> +static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
> +{
> + if (atomic_dec_and_test(&rcpu->refcnt)) {
> + /* The queue should be empty at this point */
> + __cpu_map_ring_cleanup(rcpu->queue);
> + ptr_ring_cleanup(rcpu->queue, NULL);
> + kfree(rcpu->queue);
> + kfree(rcpu);
> + }
> +}
> +
> static int cpu_map_kthread_run(void *data)
> {
> struct bpf_cpu_map_entry *rcpu = data;
> @@ -307,7 +319,7 @@ static int cpu_map_kthread_run(void *data)
>
> skb = cpu_map_build_skb(rcpu, xdp_pkt);
> if (!skb) {
> - page_frag_free(xdp_pkt);
> + xdp_return_frame(xdp_pkt, &xdp_pkt->mem);
> continue;
> }
>
> @@ -604,13 +616,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
> spin_lock(&q->producer_lock);
>
> for (i = 0; i < bq->count; i++) {
> - void *xdp_pkt = bq->q[i];
> + struct xdp_pkt *xdp_pkt = bq->q[i];
> int err;
>
> err = __ptr_ring_produce(q, xdp_pkt);
> if (err) {
> drops++;
> - page_frag_free(xdp_pkt); /* Free xdp_pkt */
> + xdp_return_frame(xdp_pkt->data, &xdp_pkt->mem);
> }
> processed++;
> }
> diff --git a/net/core/xdp.c b/net/core/xdp.c
> index 097a0f74e004..9eee0c431126 100644
> --- a/net/core/xdp.c
> +++ b/net/core/xdp.c
> @@ -71,3 +71,21 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq)
> return (xdp_rxq->reg_state == REG_STATE_REGISTERED);
> }
> EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg);
> +
> +int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
> + enum mem_type type, void *allocator)
> +{
> + if (type >= MEM_TYPE_MAX)
> + return -EINVAL;
> +
> + xdp_rxq->mem.type = type;
> +
> + if (allocator)
> + return -EOPNOTSUPP;
> +
> + /* TODO: Allocate an ID that maps to allocator pointer
> + * See: https://www.kernel.org/doc/html/latest/core-api/idr.html
> + */
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
>
Powered by blists - more mailing lists