Message-ID: <CAJ+HfNjC6Rd_eK3Uw=QynfhEc1n8Bm-3RZrTywXMMeTWjNsMjw@mail.gmail.com>
Date: Mon, 24 Jun 2019 17:29:18 +0200
From: Björn Töpel <bjorn.topel@...il.com>
To: Kevin Laatz <kevin.laatz@...el.com>
Cc: Netdev <netdev@...r.kernel.org>,
Alexei Starovoitov <ast@...nel.org>,
Daniel Borkmann <daniel@...earbox.net>,
Björn Töpel <bjorn.topel@...el.com>,
"Karlsson, Magnus" <magnus.karlsson@...el.com>,
bpf <bpf@...r.kernel.org>,
intel-wired-lan <intel-wired-lan@...ts.osuosl.org>,
Bruce Richardson <bruce.richardson@...el.com>,
ciara.loftus@...el.com
Subject: Re: [PATCH 06/11] xsk: add support to allow unaligned chunk placement
On Thu, 20 Jun 2019 at 19:25, Kevin Laatz <kevin.laatz@...el.com> wrote:
>
> Currently, addresses are chunk size aligned. This means we are very
> restricted in terms of where we can place chunks within the umem. For
> example, if we have a chunk size of 2k, then our chunks can only be placed
> at 0,2k,4k,6k,8k... and so on (ie. every 2k starting from 0).
>
> This patch introduces the ability to use unaligned chunks. With these
> changes, we are no longer bound to having to place chunks at a 2k (or
> whatever your chunk size is) interval. Since we are no longer dealing with
> aligned chunks, they can now cross page boundaries. Checks for page
> contiguity have been added in order to keep track of which pages are
> followed by a physically contiguous page.
>
> Signed-off-by: Kevin Laatz <kevin.laatz@...el.com>
> Signed-off-by: Ciara Loftus <ciara.loftus@...el.com>
> ---
> include/net/xdp_sock.h | 2 ++
> include/uapi/linux/if_xdp.h | 4 +++
> net/xdp/xdp_umem.c | 17 +++++++----
> net/xdp/xsk.c | 60 ++++++++++++++++++++++++++++++++-----
> net/xdp/xsk_queue.h | 60 ++++++++++++++++++++++++++++++++-----
> 5 files changed, 121 insertions(+), 22 deletions(-)
>
> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> index ae0f368a62bb..c07977e271c4 100644
> --- a/include/net/xdp_sock.h
> +++ b/include/net/xdp_sock.h
> @@ -19,6 +19,7 @@ struct xsk_queue;
> struct xdp_umem_page {
> void *addr;
> dma_addr_t dma;
> + bool next_pg_contig;
> };
>
> struct xdp_umem_fq_reuse {
> @@ -48,6 +49,7 @@ struct xdp_umem {
> bool zc;
> spinlock_t xsk_list_lock;
> struct list_head xsk_list;
> + u32 flags;
> };
>
> struct xdp_sock {
> diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
> index caed8b1614ff..8548f2110a77 100644
> --- a/include/uapi/linux/if_xdp.h
> +++ b/include/uapi/linux/if_xdp.h
> @@ -17,6 +17,9 @@
> #define XDP_COPY (1 << 1) /* Force copy-mode */
> #define XDP_ZEROCOPY (1 << 2) /* Force zero-copy mode */
>
> +/* Flags for xsk_umem_config flags */
> +#define XDP_UMEM_UNALIGNED_CHUNKS (1 << 0)
> +
> struct sockaddr_xdp {
> __u16 sxdp_family;
> __u16 sxdp_flags;
> @@ -52,6 +55,7 @@ struct xdp_umem_reg {
> __u64 len; /* Length of packet data area */
> __u32 chunk_size;
> __u32 headroom;
> + __u32 flags;
> };
>
> struct xdp_statistics {
> diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> index 2b18223e7eb8..4d4782572855 100644
> --- a/net/xdp/xdp_umem.c
> +++ b/net/xdp/xdp_umem.c
> @@ -301,12 +301,14 @@ static int xdp_umem_account_pages(struct xdp_umem *umem)
>
> static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
> {
> + bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNKS;
You need to validate the flags field from userland, i.e. check for
unknown flags.
Also, headroom != 0 doesn't make sense in the unaligned chunks case,
right? Perhaps add a check that rejects headroom != 0 for the unaligned
case?
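
Something along these lines, maybe (untested sketch; the exact
placement within xdp_umem_reg() is up for discussion):

	/* Reject unknown flags, so they can be given a meaning later on. */
	if (mr->flags & ~XDP_UMEM_UNALIGNED_CHUNKS)
		return -EINVAL;

	/* Headroom has no meaning when userspace places the buffers
	 * itself, so reject that combination as well.
	 */
	if (unaligned_chunks && mr->headroom)
		return -EINVAL;
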
> u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
> unsigned int chunks, chunks_per_page;
> u64 addr = mr->addr, size = mr->len;
> int size_chk, err, i;
>
> - if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
> + if ((!unaligned_chunks && chunk_size < XDP_UMEM_MIN_CHUNK_SIZE) ||
Hmm, so the chunk_size is the "buffer length" in the unaligned chunks
case. Any reason to relax (and allow) chunk_size==0? Is there a need
for the extra !unaligned_chunks check here?
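
I.e. unless there is a use case for really small buffers, the check
could stay common for both modes, something like (untested sketch):

	/* Too small to hold a frame, or larger than a page. */
	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE)
		return -EINVAL;
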
> + chunk_size > PAGE_SIZE) {
> /* Strictly speaking we could support this, if:
> * - huge pages, or*
> * - using an IOMMU, or
> @@ -316,7 +318,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
> return -EINVAL;
> }
>
> - if (!is_power_of_2(chunk_size))
> + if (!unaligned_chunks && !is_power_of_2(chunk_size))
> return -EINVAL;
>
> if (!PAGE_ALIGNED(addr)) {
> @@ -333,9 +335,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
> if (chunks == 0)
> return -EINVAL;
>
> - chunks_per_page = PAGE_SIZE / chunk_size;
> - if (chunks < chunks_per_page || chunks % chunks_per_page)
> - return -EINVAL;
> + if (!unaligned_chunks) {
> + chunks_per_page = PAGE_SIZE / chunk_size;
> + if (chunks < chunks_per_page || chunks % chunks_per_page)
> + return -EINVAL;
> + }
>
> headroom = ALIGN(headroom, 64);
>
> @@ -344,13 +348,14 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
> return -EINVAL;
>
> umem->address = (unsigned long)addr;
> - umem->chunk_mask = ~((u64)chunk_size - 1);
> + umem->chunk_mask = unaligned_chunks ? U64_MAX : ~((u64)chunk_size - 1);
Now that chunk_mask can be "invalid", xsk_diag.c needs a change as
well, since the mask is used to get the size. I suggest using
chunk_size_nohr and headroom in xsk_diag.c instead.
Related, are you using chunk_mask at all in the unaligned paths
(probably, right)?
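
I.e. something like this in xsk_diag.c, instead of deriving the chunk
size from the mask (untested sketch, going from memory of how the
size is reported there):

	du.chunk_size = umem->chunk_size_nohr + umem->headroom;
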
> umem->size = size;
> umem->headroom = headroom;
> umem->chunk_size_nohr = chunk_size - headroom;
> umem->npgs = size / PAGE_SIZE;
> umem->pgs = NULL;
> umem->user = NULL;
> + umem->flags = mr->flags;
> INIT_LIST_HEAD(&umem->xsk_list);
> spin_lock_init(&umem->xsk_list_lock);
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index a14e8864e4fa..dfae19942b70 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -39,7 +39,7 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
>
> u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
> {
> - return xskq_peek_addr(umem->fq, addr);
> + return xskq_peek_addr(umem->fq, addr, umem);
> }
> EXPORT_SYMBOL(xsk_umem_peek_addr);
>
> @@ -49,14 +49,36 @@ void xsk_umem_discard_addr(struct xdp_umem *umem)
> }
> EXPORT_SYMBOL(xsk_umem_discard_addr);
>
> +/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for
> + * each page. This is only required in copy mode.
> + */
> +static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf,
> + u32 len, u32 metalen)
> +{
> + void *to_buf = xdp_umem_get_data(umem, addr);
> +
> + if (xskq_crosses_non_contig_pg(umem, addr)) {
> + void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT)+1].addr;
> + u64 page_start = addr & (PAGE_SIZE-1);
> + u64 first_len = PAGE_SIZE - (addr - page_start);
> +
> + memcpy(to_buf, from_buf, first_len + metalen);
> + memcpy(next_pg_addr, from_buf + first_len, len - first_len);
> +
> + return;
> + }
> +
> + memcpy(to_buf, from_buf, len + metalen);
> +}
> +
> static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
> {
> - void *to_buf, *from_buf;
> + void *from_buf;
> u32 metalen;
> u64 addr;
> int err;
>
> - if (!xskq_peek_addr(xs->umem->fq, &addr) ||
> + if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
> len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
> xs->rx_dropped++;
> return -ENOSPC;
> @@ -72,8 +94,8 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
> metalen = xdp->data - xdp->data_meta;
> }
>
> - to_buf = xdp_umem_get_data(xs->umem, addr);
> - memcpy(to_buf, from_buf, len + metalen);
> + __xsk_rcv_memcpy(xs->umem, addr, from_buf, len, metalen);
> +
> addr += metalen;
> err = xskq_produce_batch_desc(xs->rx, addr, len);
> if (!err) {
> @@ -126,7 +148,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
> if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
> return -EINVAL;
>
> - if (!xskq_peek_addr(xs->umem->fq, &addr) ||
> + if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
> len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
> xs->rx_dropped++;
> return -ENOSPC;
> @@ -173,7 +195,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
>
> rcu_read_lock();
> list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
> - if (!xskq_peek_desc(xs->tx, &desc))
> + if (!xskq_peek_desc(xs->tx, &desc, umem))
> continue;
>
> if (xskq_produce_addr_lazy(umem->cq, desc.addr))
> @@ -226,7 +248,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
>
> mutex_lock(&xs->mutex);
>
> - while (xskq_peek_desc(xs->tx, &desc)) {
> + while (xskq_peek_desc(xs->tx, &desc, xs->umem)) {
> char *buffer;
> u64 addr;
> u32 len;
> @@ -393,6 +415,26 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd)
> return sock;
> }
>
> +/* Check if umem pages are contiguous.
> + * If zero-copy mode, use the DMA address to do the page contiguity check
> + * For all other modes we use addr (kernel virtual address)
> + */
> +static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags)
> +{
> + int i;
> +
> + if (flags & XDP_ZEROCOPY) {
> + for (i = 0; i < umem->npgs-1; i++)
> + umem->pages[i].next_pg_contig =
> + (umem->pages[i].dma + PAGE_SIZE == umem->pages[i+1].dma);
> + return;
> + }
> +
> + for (i = 0; i < umem->npgs-1; i++)
> + umem->pages[i].next_pg_contig =
> + (umem->pages[i].addr + PAGE_SIZE == umem->pages[i+1].addr);
> +}
> +
> static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
> {
> struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
> @@ -480,6 +522,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
> err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
> if (err)
> goto out_unlock;
> +
> + xsk_check_page_contiguity(xs->umem, flags);
> }
>
> xs->dev = dev;
> diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
> index 88b9ae24658d..c2676a1df938 100644
> --- a/net/xdp/xsk_queue.h
> +++ b/net/xdp/xsk_queue.h
> @@ -119,6 +119,14 @@ static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
>
> /* UMEM queue */
>
> +static inline bool xskq_crosses_non_contig_pg(struct xdp_umem *umem, u64 addr)
> +{
> + bool cross_pg = (addr & (PAGE_SIZE-1)) + umem->chunk_size_nohr > PAGE_SIZE;
> + bool next_pg_contig = umem->pages[(addr >> PAGE_SHIFT)+1].next_pg_contig;
> +
> + return cross_pg && !next_pg_contig;
> +}
> +
> static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
> {
> if (addr >= q->size) {
> @@ -129,23 +137,44 @@ static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
> return true;
> }
>
> -static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr)
> +static inline bool xskq_is_valid_addr_unaligned(struct xsk_queue *q, u64 addr,
> + struct xdp_umem *umem)
> +{
> + if (addr >= q->size ||
> + xskq_crosses_non_contig_pg(umem, addr)) {
> + q->invalid_descs++;
> + return false;
> + }
> +
> + return true;
> +}
> +
> +static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr,
> + struct xdp_umem *umem)
> {
> while (q->cons_tail != q->cons_head) {
> struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
> unsigned int idx = q->cons_tail & q->ring_mask;
>
> *addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask;
> + if (umem->flags & XDP_UMEM_UNALIGNED_CHUNKS) {
> + if (xskq_is_valid_addr_unaligned(q, *addr, umem))
> + return addr;
> + goto out;
> + }
> +
> if (xskq_is_valid_addr(q, *addr))
> return addr;
>
> +out:
> q->cons_tail++;
> }
>
> return NULL;
> }
>
> -static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr)
> +static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr,
> + struct xdp_umem *umem)
> {
> if (q->cons_tail == q->cons_head) {
> smp_mb(); /* D, matches A */
> @@ -156,7 +185,7 @@ static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr)
> smp_rmb();
> }
>
> - return xskq_validate_addr(q, addr);
> + return xskq_validate_addr(q, addr, umem);
> }
>
> static inline void xskq_discard_addr(struct xsk_queue *q)
> @@ -215,8 +244,21 @@ static inline int xskq_reserve_addr(struct xsk_queue *q)
>
> /* Rx/Tx queue */
>
> -static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
> +static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d,
> + struct xdp_umem *umem)
> {
> + if (umem->flags & XDP_UMEM_UNALIGNED_CHUNKS) {
> + if (!xskq_is_valid_addr_unaligned(q, d->addr, umem))
> + return false;
> +
> + if ((d->len > umem->chunk_size_nohr) || d->options) {
> + q->invalid_descs++;
> + return false;
> + }
> +
> + return true;
> + }
> +
> if (!xskq_is_valid_addr(q, d->addr))
> return false;
>
> @@ -230,14 +272,15 @@ static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
> }
>
> static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
> - struct xdp_desc *desc)
> + struct xdp_desc *desc,
> + struct xdp_umem *umem)
> {
> while (q->cons_tail != q->cons_head) {
> struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
> unsigned int idx = q->cons_tail & q->ring_mask;
>
> *desc = READ_ONCE(ring->desc[idx]);
> - if (xskq_is_valid_desc(q, desc))
> + if (xskq_is_valid_desc(q, desc, umem))
> return desc;
>
> q->cons_tail++;
> @@ -247,7 +290,8 @@ static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
> }
>
> static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
> - struct xdp_desc *desc)
> + struct xdp_desc *desc,
> + struct xdp_umem *umem)
> {
> if (q->cons_tail == q->cons_head) {
> smp_mb(); /* D, matches A */
> @@ -258,7 +302,7 @@ static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
> smp_rmb(); /* C, matches B */
> }
>
> - return xskq_validate_desc(q, desc);
> + return xskq_validate_desc(q, desc, umem);
> }
>
> static inline void xskq_discard_desc(struct xsk_queue *q)
> --
> 2.17.1
>