lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <14d20c4b8e304ee09f8cb76f5981a526@huawei.com>
Date: Mon, 27 Jan 2025 02:55:56 +0000
From: lizetao <lizetao1@...wei.com>
To: David Wei <dw@...idwei.uk>, "io-uring@...r.kernel.org"
	<io-uring@...r.kernel.org>, "netdev@...r.kernel.org" <netdev@...r.kernel.org>
CC: Jens Axboe <axboe@...nel.dk>, Pavel Begunkov <asml.silence@...il.com>,
	Jakub Kicinski <kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>, "David S.
 Miller" <davem@...emloft.net>, Eric Dumazet <edumazet@...gle.com>, "Jesper
 Dangaard Brouer" <hawk@...nel.org>, David Ahern <dsahern@...nel.org>, "Mina
 Almasry" <almasrymina@...gle.com>, Stanislav Fomichev <stfomichev@...il.com>,
	Joe Damato <jdamato@...tly.com>, Pedro Tammela <pctammela@...atatu.com>
Subject: RE: [PATCH net-next v11 12/21] io_uring/zcrx: add io_zcrx_area

Hi,

> -----Original Message-----
> From: David Wei <dw@...idwei.uk>
> Sent: Friday, January 17, 2025 7:17 AM
> To: io-uring@...r.kernel.org; netdev@...r.kernel.org
> Cc: Jens Axboe <axboe@...nel.dk>; Pavel Begunkov <asml.silence@...il.com>;
> Jakub Kicinski <kuba@...nel.org>; Paolo Abeni <pabeni@...hat.com>; David S.
> Miller <davem@...emloft.net>; Eric Dumazet <edumazet@...gle.com>;
> Jesper Dangaard Brouer <hawk@...nel.org>; David Ahern
> <dsahern@...nel.org>; Mina Almasry <almasrymina@...gle.com>; Stanislav
> Fomichev <stfomichev@...il.com>; Joe Damato <jdamato@...tly.com>;
> Pedro Tammela <pctammela@...atatu.com>
> Subject: [PATCH net-next v11 12/21] io_uring/zcrx: add io_zcrx_area
> 
> Add io_zcrx_area that represents a region of userspace memory that is used for
> zero copy. During ifq registration, userspace passes in the uaddr and len of
> userspace memory, which is then pinned by the kernel.
> Each net_iov is mapped to one of these pages.
> 
> The freelist is a spinlock protected list that keeps track of all the net_iovs/pages
> that aren't used.
> 
> For now, there is only one area per ifq and area registration happens implicitly
> as part of ifq registration. There is no API for adding/removing areas yet. The
> struct for area registration is there for future extensibility once we support
> multiple areas and TCP devmem.
> 
> Reviewed-by: Jens Axboe <axboe@...nel.dk>
> Signed-off-by: Pavel Begunkov <asml.silence@...il.com>
> Signed-off-by: David Wei <dw@...idwei.uk>
> ---
>  include/uapi/linux/io_uring.h |  9 ++++
>  io_uring/rsrc.c               |  2 +-
>  io_uring/rsrc.h               |  1 +
>  io_uring/zcrx.c               | 89
> ++++++++++++++++++++++++++++++++++-
>  io_uring/zcrx.h               | 16 +++++++
>  5 files changed, 114 insertions(+), 3 deletions(-)
> 
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index
> 3af8b7a19824..e251f28507ce 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -980,6 +980,15 @@ struct io_uring_zcrx_offsets {
>  	__u64	__resv[2];
>  };
> 
> +struct io_uring_zcrx_area_reg {
> +	__u64	addr;
> +	__u64	len;
> +	__u64	rq_area_token;
> +	__u32	flags;
> +	__u32	__resv1;
> +	__u64	__resv2[2];
> +};
> +
>  /*
>   * Argument for IORING_REGISTER_ZCRX_IFQ
>   */
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index f2ff108485c8..d0f11b5aec0d
> 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -77,7 +77,7 @@ static int io_account_mem(struct io_ring_ctx *ctx,
> unsigned long nr_pages)
>  	return 0;
>  }
> 
> -static int io_buffer_validate(struct iovec *iov)
> +int io_buffer_validate(struct iovec *iov)
>  {
>  	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
> 
> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index c8b093584461..0ae54ddeb1fd
> 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -66,6 +66,7 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void
> __user *arg,
>  			    unsigned size, unsigned type);
>  int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
>  			unsigned int size, unsigned int type);
> +int io_buffer_validate(struct iovec *iov);
> 
>  bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
>  			      struct io_imu_folio_data *data); diff --git
> a/io_uring/zcrx.c b/io_uring/zcrx.c index f3ace7e8264d..04883a3ae80c 100644
> --- a/io_uring/zcrx.c
> +++ b/io_uring/zcrx.c
> @@ -10,6 +10,7 @@
>  #include "kbuf.h"
>  #include "memmap.h"
>  #include "zcrx.h"
> +#include "rsrc.h"
> 
>  #define IO_RQ_MAX_ENTRIES		32768
> 
> @@ -44,6 +45,79 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
>  	ifq->rqes = NULL;
>  }
> 
> +static void io_zcrx_free_area(struct io_zcrx_area *area) {
> +	kvfree(area->freelist);
> +	kvfree(area->nia.niovs);
> +	if (area->pages) {
> +		unpin_user_pages(area->pages, area->nia.num_niovs);
> +		kvfree(area->pages);
> +	}
> +	kfree(area);
> +}
> +
> +static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
> +			       struct io_zcrx_area **res,
> +			       struct io_uring_zcrx_area_reg *area_reg) {
> +	struct io_zcrx_area *area;
> +	int i, ret, nr_pages;
> +	struct iovec iov;
> +
> +	if (area_reg->flags || area_reg->rq_area_token)
> +		return -EINVAL;
> +	if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1])
> +		return -EINVAL;
> +	if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
> +		return -EINVAL;
> +
> +	iov.iov_base = u64_to_user_ptr(area_reg->addr);
> +	iov.iov_len = area_reg->len;
> +	ret = io_buffer_validate(&iov);
> +	if (ret)
> +		return ret;
> +
> +	ret = -ENOMEM;
> +	area = kzalloc(sizeof(*area), GFP_KERNEL);
> +	if (!area)
> +		goto err;
> +
> +	area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
> +				   &nr_pages);
> +	if (IS_ERR(area->pages)) {
> +		ret = PTR_ERR(area->pages);
> +		area->pages = NULL;
> +		goto err;
> +	}
> +	area->nia.num_niovs = nr_pages;
> +
> +	area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]),
> +					 GFP_KERNEL | __GFP_ZERO);
> +	if (!area->nia.niovs)
> +		goto err;
> +
> +	area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]),
> +					GFP_KERNEL | __GFP_ZERO);
> +	if (!area->freelist)
> +		goto err;
> +
> +	for (i = 0; i < nr_pages; i++)
> +		area->freelist[i] = i;

This initialization loop is redundant, as patch 14 of this series will reinitialize the freelist.
> +
> +	area->free_count = nr_pages;
> +	area->ifq = ifq;
> +	/* we're only supporting one area per ifq for now */
> +	area->area_id = 0;
> +	area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT;
> +	spin_lock_init(&area->freelist_lock);
> +	*res = area;
> +	return 0;
> +err:
> +	if (area)
> +		io_zcrx_free_area(area);
> +	return ret;
> +}
> +
>  static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)  {
>  	struct io_zcrx_ifq *ifq;
> @@ -59,6 +133,9 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct
> io_ring_ctx *ctx)
> 
>  static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)  {
> +	if (ifq->area)
> +		io_zcrx_free_area(ifq->area);
> +
>  	io_free_rbuf_ring(ifq);
>  	kfree(ifq);
>  }
> @@ -66,6 +143,7 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)  int
> io_register_zcrx_ifq(struct io_ring_ctx *ctx,
>  			  struct io_uring_zcrx_ifq_reg __user *arg)  {
> +	struct io_uring_zcrx_area_reg area;
>  	struct io_uring_zcrx_ifq_reg reg;
>  	struct io_uring_region_desc rd;
>  	struct io_zcrx_ifq *ifq;
> @@ -99,7 +177,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
>  	}
>  	reg.rq_entries = roundup_pow_of_two(reg.rq_entries);
> 
> -	if (!reg.area_ptr)
> +	if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr),
> +sizeof(area)))
>  		return -EFAULT;
> 
>  	ifq = io_zcrx_ifq_alloc(ctx);
> @@ -110,6 +188,10 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
>  	if (ret)
>  		goto err;
> 
> +	ret = io_zcrx_create_area(ifq, &ifq->area, &area);
> +	if (ret)
> +		goto err;
> +
>  	ifq->rq_entries = reg.rq_entries;
>  	ifq->if_rxq = reg.if_rxq;
> 
> @@ -122,7 +204,10 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
>  		ret = -EFAULT;
>  		goto err;
>  	}
> -
> +	if (copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) {
> +		ret = -EFAULT;
> +		goto err;
> +	}
>  	ctx->ifq = ifq;
>  	return 0;
>  err:
> diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index
> 58e4ab6c6083..53fd94b65b38 100644
> --- a/io_uring/zcrx.h
> +++ b/io_uring/zcrx.h
> @@ -3,9 +3,25 @@
>  #define IOU_ZC_RX_H
> 
>  #include <linux/io_uring_types.h>
> +#include <net/page_pool/types.h>
> +
> +struct io_zcrx_area {
> +	struct net_iov_area	nia;
> +	struct io_zcrx_ifq	*ifq;
> +
> +	u16			area_id;
> +	struct page		**pages;
> +
> +	/* freelist */
> +	spinlock_t		freelist_lock ____cacheline_aligned_in_smp;
> +	u32			free_count;
> +	u32			*freelist;
> +};
> 
>  struct io_zcrx_ifq {
>  	struct io_ring_ctx		*ctx;
> +	struct io_zcrx_area		*area;
> +
>  	struct io_uring			*rq_ring;
>  	struct io_uring_zcrx_rqe	*rqes;
>  	u32				rq_entries;
> --
> 2.43.5
> 
> 

--
Li Zetao

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ