Message-ID: <0099021f-9185-69c1-3e63-64afdba988cf@gmail.com>
Date:   Mon, 7 Nov 2022 10:33:41 -0800
From:   Eric Dumazet <eric.dumazet@...il.com>
To:     Stefan Roesch <shr@...kernel.io>, kernel-team@...com
Cc:     axboe@...nel.dk, olivier@...llion01.com, netdev@...r.kernel.org,
        io-uring@...r.kernel.org, kuba@...nel.org
Subject: Re: [RFC PATCH v2 1/2] io_uring: add napi busy polling support


On 11/7/22 09:52, Stefan Roesch wrote:
> This adds napi busy polling support in io_uring.c. It adds a new
> napi_list to the io_ring_ctx structure. This list contains the
> napi_ids that are currently enabled for busy polling. The list is
> synchronized by the new napi_lock spin lock. The current default napi
> busy polling timeout is stored in napi_busy_poll_to; if napi busy
> polling is not enabled, the value is 0.
>
> The busy poll timeout is also stored as part of the io_wait_queue. This
> is necessary because the poll interval needs to be adjusted for sq
> polling, and the napi callback only allows a single value to be passed in.
>
> Testing has shown that round-trip times are reduced from 55us to 38us
> by enabling napi busy polling with a busy poll timeout of 100us.
>
> Signed-off-by: Stefan Roesch <shr@...kernel.io>
> Suggested-by: Olivier Langlois <olivier@...llion01.com>
> ---
>   include/linux/io_uring_types.h |   6 +
>   io_uring/io_uring.c            | 240 +++++++++++++++++++++++++++++++++
>   io_uring/napi.h                |  22 +++
>   io_uring/poll.c                |   3 +
>   io_uring/sqpoll.c              |   9 ++
>   5 files changed, 280 insertions(+)
>   create mode 100644 io_uring/napi.h
>
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index f5b687a787a3..84b446b0d215 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -270,6 +270,12 @@ struct io_ring_ctx {
>   	struct xarray		personalities;
>   	u32			pers_next;
>   
> +#ifdef CONFIG_NET_RX_BUSY_POLL
> +	struct list_head	napi_list;	/* track busy poll napi_id */
> +	spinlock_t		napi_lock;	/* napi_list lock */
> +	unsigned int		napi_busy_poll_to; /* napi busy poll default timeout */
> +#endif
> +
>   	struct {
>   		/*
>   		 * We cache a range of free CQEs we can use, once exhausted it
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index ac8c488e3077..b02bba4ebcbf 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -90,6 +90,7 @@
>   #include "rsrc.h"
>   #include "cancel.h"
>   #include "net.h"
> +#include "napi.h"
>   #include "notif.h"
>   
>   #include "timeout.h"
> @@ -327,6 +328,13 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
>   	INIT_WQ_LIST(&ctx->locked_free_list);
>   	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
>   	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
> +
> +#ifdef CONFIG_NET_RX_BUSY_POLL
> +	INIT_LIST_HEAD(&ctx->napi_list);
> +	spin_lock_init(&ctx->napi_lock);
> +	ctx->napi_busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
> +#endif
> +
>   	return ctx;
>   err:
>   	kfree(ctx->dummy_ubuf);
> @@ -2303,6 +2311,10 @@ struct io_wait_queue {
>   	struct io_ring_ctx *ctx;
>   	unsigned cq_tail;
>   	unsigned nr_timeouts;
> +
> +#ifdef CONFIG_NET_RX_BUSY_POLL
> +	unsigned int busy_poll_to;
> +#endif
>   };
>   
>   static inline bool io_has_work(struct io_ring_ctx *ctx)
> @@ -2376,6 +2388,198 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>   	return 1;
>   }
>   
> +#ifdef CONFIG_NET_RX_BUSY_POLL
> +#define NAPI_TIMEOUT		(60 * SEC_CONVERSION)
> +
> +struct io_napi_entry {
> +	struct list_head	list;
> +	unsigned int		napi_id;
> +	unsigned long		timeout;
> +};
> +
> +static bool io_napi_busy_loop_on(struct io_ring_ctx *ctx)
> +{
> +	return READ_ONCE(ctx->napi_busy_poll_to);
> +}
> +
> +/*
> + * io_napi_add() - Add napi id to the busy poll list
> + * @file: file pointer for socket
> + * @ctx:  io-uring context
> + *
> + * Add the napi id of the socket to the napi busy poll list.
> + */
> +void io_napi_add(struct file *file, struct io_ring_ctx *ctx)
> +{
> +	unsigned int napi_id;
> +	struct socket *sock;
> +	struct sock *sk;
> +	struct io_napi_entry *ne;
> +
> +	if (!io_napi_busy_loop_on(ctx))
> +		return;
> +
> +	sock = sock_from_file(file);
> +	if (!sock)
> +		return;
> +
> +	sk = sock->sk;
> +	if (!sk)
> +		return;
> +
> +	napi_id = READ_ONCE(sk->sk_napi_id);
> +
> +	/* Non-NAPI IDs can be rejected */
> +	if (napi_id < MIN_NAPI_ID)
> +		return;
> +
> +	spin_lock(&ctx->napi_lock);
> +	list_for_each_entry(ne, &ctx->napi_list, list) {
> +		if (ne->napi_id == napi_id) {
> +			ne->timeout = jiffies + NAPI_TIMEOUT;
> +			goto out;
> +		}

This list could become very big if you do not remove stale napi_id entries from it.

Device reconfiguration does not recycle napi_ids; it creates new ones.
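
A minimal sketch of the kind of reaping the ne->timeout field seems
intended for (the helper name and call site are hypothetical, not part
of this patch):

static void io_napi_remove_stale(struct io_ring_ctx *ctx)
{
	struct io_napi_entry *ne, *n;

	/* drop entries that have not been refreshed within NAPI_TIMEOUT */
	spin_lock(&ctx->napi_lock);
	list_for_each_entry_safe(ne, n, &ctx->napi_list, list) {
		if (time_after(jiffies, ne->timeout)) {
			list_del(&ne->list);
			kfree(ne);
		}
	}
	spin_unlock(&ctx->napi_lock);
}

Something like this, run periodically or from io_napi_add() itself,
would keep the list bounded when device reconfiguration keeps creating
new napi_ids.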


> +	}
> +
> +	ne = kmalloc(sizeof(*ne), GFP_NOWAIT);
> +	if (!ne)
> +		goto out;
> +
> +	ne->napi_id = napi_id;
> +	ne->timeout = jiffies + NAPI_TIMEOUT;
> +	list_add_tail(&ne->list, &ctx->napi_list);
> +
> +out:
> +	spin_unlock(&ctx->napi_lock);
> +}
> +
> +
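
The code that actually consumes the list from the cqring-wait path is
not part of the excerpt quoted above. As a rough sketch of the idea
(function name hypothetical, locking against io_napi_add() omitted for
brevity), the collected ids would be handed to the existing
napi_busy_loop() helper:

static void io_napi_do_busy_loop(struct io_ring_ctx *ctx)
{
	struct io_napi_entry *ne;

	/*
	 * One polling pass per tracked napi_id; a loop_end callback tied
	 * to napi_busy_poll_to would be needed to honour the timeout
	 * described in the changelog.
	 */
	list_for_each_entry(ne, &ctx->napi_list, list)
		napi_busy_loop(ne->napi_id, NULL, NULL, false,
			       16 /* arbitrary budget for the sketch */);
}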
