[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <14d20c4b8e304ee09f8cb76f5981a526@huawei.com>
Date: Mon, 27 Jan 2025 02:55:56 +0000
From: lizetao <lizetao1@...wei.com>
To: David Wei <dw@...idwei.uk>, "io-uring@...r.kernel.org"
<io-uring@...r.kernel.org>, "netdev@...r.kernel.org" <netdev@...r.kernel.org>
CC: Jens Axboe <axboe@...nel.dk>, Pavel Begunkov <asml.silence@...il.com>,
Jakub Kicinski <kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>, "David S.
Miller" <davem@...emloft.net>, Eric Dumazet <edumazet@...gle.com>, "Jesper
Dangaard Brouer" <hawk@...nel.org>, David Ahern <dsahern@...nel.org>, "Mina
Almasry" <almasrymina@...gle.com>, Stanislav Fomichev <stfomichev@...il.com>,
Joe Damato <jdamato@...tly.com>, Pedro Tammela <pctammela@...atatu.com>
Subject: RE: [PATCH net-next v11 12/21] io_uring/zcrx: add io_zcrx_area
Hi,
> -----Original Message-----
> From: David Wei <dw@...idwei.uk>
> Sent: Friday, January 17, 2025 7:17 AM
> To: io-uring@...r.kernel.org; netdev@...r.kernel.org
> Cc: Jens Axboe <axboe@...nel.dk>; Pavel Begunkov <asml.silence@...il.com>;
> Jakub Kicinski <kuba@...nel.org>; Paolo Abeni <pabeni@...hat.com>; David S.
> Miller <davem@...emloft.net>; Eric Dumazet <edumazet@...gle.com>;
> Jesper Dangaard Brouer <hawk@...nel.org>; David Ahern
> <dsahern@...nel.org>; Mina Almasry <almasrymina@...gle.com>; Stanislav
> Fomichev <stfomichev@...il.com>; Joe Damato <jdamato@...tly.com>;
> Pedro Tammela <pctammela@...atatu.com>
> Subject: [PATCH net-next v11 12/21] io_uring/zcrx: add io_zcrx_area
>
> Add io_zcrx_area that represents a region of userspace memory that is used for
> zero copy. During ifq registration, userspace passes in the uaddr and len of
> userspace memory, which is then pinned by the kernel.
> Each net_iov is mapped to one of these pages.
>
> The freelist is a spinlock protected list that keeps track of all the net_iovs/pages
> that aren't used.
>
> For now, there is only one area per ifq and area registration happens implicitly
> as part of ifq registration. There is no API for adding/removing areas yet. The
> struct for area registration is there for future extensibility once we support
> multiple areas and TCP devmem.
>
> Reviewed-by: Jens Axboe <axboe@...nel.dk>
> Signed-off-by: Pavel Begunkov <asml.silence@...il.com>
> Signed-off-by: David Wei <dw@...idwei.uk>
> ---
> include/uapi/linux/io_uring.h | 9 ++++
> io_uring/rsrc.c | 2 +-
> io_uring/rsrc.h | 1 +
> io_uring/zcrx.c | 89
> ++++++++++++++++++++++++++++++++++-
> io_uring/zcrx.h | 16 +++++++
> 5 files changed, 114 insertions(+), 3 deletions(-)
>
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index
> 3af8b7a19824..e251f28507ce 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -980,6 +980,15 @@ struct io_uring_zcrx_offsets {
> __u64 __resv[2];
> };
>
> +struct io_uring_zcrx_area_reg {
> + __u64 addr;
> + __u64 len;
> + __u64 rq_area_token;
> + __u32 flags;
> + __u32 __resv1;
> + __u64 __resv2[2];
> +};
> +
> /*
> * Argument for IORING_REGISTER_ZCRX_IFQ
> */
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index f2ff108485c8..d0f11b5aec0d
> 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -77,7 +77,7 @@ static int io_account_mem(struct io_ring_ctx *ctx,
> unsigned long nr_pages)
> return 0;
> }
>
> -static int io_buffer_validate(struct iovec *iov)
> +int io_buffer_validate(struct iovec *iov)
> {
> unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
>
> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index c8b093584461..0ae54ddeb1fd
> 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -66,6 +66,7 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void
> __user *arg,
> unsigned size, unsigned type);
> int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
> unsigned int size, unsigned int type);
> +int io_buffer_validate(struct iovec *iov);
>
> bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
> struct io_imu_folio_data *data); diff --git
> a/io_uring/zcrx.c b/io_uring/zcrx.c index f3ace7e8264d..04883a3ae80c 100644
> --- a/io_uring/zcrx.c
> +++ b/io_uring/zcrx.c
> @@ -10,6 +10,7 @@
> #include "kbuf.h"
> #include "memmap.h"
> #include "zcrx.h"
> +#include "rsrc.h"
>
> #define IO_RQ_MAX_ENTRIES 32768
>
> @@ -44,6 +45,79 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
> ifq->rqes = NULL;
> }
>
> +static void io_zcrx_free_area(struct io_zcrx_area *area) {
> + kvfree(area->freelist);
> + kvfree(area->nia.niovs);
> + if (area->pages) {
> + unpin_user_pages(area->pages, area->nia.num_niovs);
> + kvfree(area->pages);
> + }
> + kfree(area);
> +}
> +
> +static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
> + struct io_zcrx_area **res,
> + struct io_uring_zcrx_area_reg *area_reg) {
> + struct io_zcrx_area *area;
> + int i, ret, nr_pages;
> + struct iovec iov;
> +
> + if (area_reg->flags || area_reg->rq_area_token)
> + return -EINVAL;
> + if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1])
> + return -EINVAL;
> + if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
> + return -EINVAL;
> +
> + iov.iov_base = u64_to_user_ptr(area_reg->addr);
> + iov.iov_len = area_reg->len;
> + ret = io_buffer_validate(&iov);
> + if (ret)
> + return ret;
> +
> + ret = -ENOMEM;
> + area = kzalloc(sizeof(*area), GFP_KERNEL);
> + if (!area)
> + goto err;
> +
> + area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
> + &nr_pages);
> + if (IS_ERR(area->pages)) {
> + ret = PTR_ERR(area->pages);
> + area->pages = NULL;
> + goto err;
> + }
> + area->nia.num_niovs = nr_pages;
> +
> + area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]),
> + GFP_KERNEL | __GFP_ZERO);
> + if (!area->nia.niovs)
> + goto err;
> +
> + area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]),
> + GFP_KERNEL | __GFP_ZERO);
> + if (!area->freelist)
> + goto err;
> +
> + for (i = 0; i < nr_pages; i++)
> + area->freelist[i] = i;
This freelist initialization loop looks redundant here, as patch 14 of this series will reinitialize the freelist.
> +
> + area->free_count = nr_pages;
> + area->ifq = ifq;
> + /* we're only supporting one area per ifq for now */
> + area->area_id = 0;
> + area_reg->rq_area_token = (u64)area->area_id <<
> IORING_ZCRX_AREA_SHIFT;
> + spin_lock_init(&area->freelist_lock);
> + *res = area;
> + return 0;
> +err:
> + if (area)
> + io_zcrx_free_area(area);
> + return ret;
> +}
> +
> static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) {
> struct io_zcrx_ifq *ifq;
> @@ -59,6 +133,9 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct
> io_ring_ctx *ctx)
>
> static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) {
> + if (ifq->area)
> + io_zcrx_free_area(ifq->area);
> +
> io_free_rbuf_ring(ifq);
> kfree(ifq);
> }
> @@ -66,6 +143,7 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) int
> io_register_zcrx_ifq(struct io_ring_ctx *ctx,
> struct io_uring_zcrx_ifq_reg __user *arg) {
> + struct io_uring_zcrx_area_reg area;
> struct io_uring_zcrx_ifq_reg reg;
> struct io_uring_region_desc rd;
> struct io_zcrx_ifq *ifq;
> @@ -99,7 +177,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
> }
> reg.rq_entries = roundup_pow_of_two(reg.rq_entries);
>
> - if (!reg.area_ptr)
> + if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr),
> +sizeof(area)))
> return -EFAULT;
>
> ifq = io_zcrx_ifq_alloc(ctx);
> @@ -110,6 +188,10 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
> if (ret)
> goto err;
>
> + ret = io_zcrx_create_area(ifq, &ifq->area, &area);
> + if (ret)
> + goto err;
> +
> ifq->rq_entries = reg.rq_entries;
> ifq->if_rxq = reg.if_rxq;
>
> @@ -122,7 +204,10 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
> ret = -EFAULT;
> goto err;
> }
> -
> + if (copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) {
> + ret = -EFAULT;
> + goto err;
> + }
> ctx->ifq = ifq;
> return 0;
> err:
> diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index
> 58e4ab6c6083..53fd94b65b38 100644
> --- a/io_uring/zcrx.h
> +++ b/io_uring/zcrx.h
> @@ -3,9 +3,25 @@
> #define IOU_ZC_RX_H
>
> #include <linux/io_uring_types.h>
> +#include <net/page_pool/types.h>
> +
> +struct io_zcrx_area {
> + struct net_iov_area nia;
> + struct io_zcrx_ifq *ifq;
> +
> + u16 area_id;
> + struct page **pages;
> +
> + /* freelist */
> + spinlock_t freelist_lock ____cacheline_aligned_in_smp;
> + u32 free_count;
> + u32 *freelist;
> +};
>
> struct io_zcrx_ifq {
> struct io_ring_ctx *ctx;
> + struct io_zcrx_area *area;
> +
> struct io_uring *rq_ring;
> struct io_uring_zcrx_rqe *rqes;
> u32 rq_entries;
> --
> 2.43.5
>
>
--
Li Zetao
Powered by blists - more mailing lists