Message-ID: <9b8b57ca-83ae-43a4-84c6-33017dc81a32@grimberg.me>
Date: Thu, 18 Jul 2024 00:19:29 +0300
From: Sagi Grimberg <sagi@...mberg.me>
To: Hannes Reinecke <hare@...nel.org>, Christoph Hellwig <hch@....de>,
netdev@...r.kernel.org
Cc: Keith Busch <kbusch@...nel.org>, linux-nvme@...ts.infradead.org,
Hannes Reinecke <hare@...e.de>
Subject: Re: [PATCH 6/8] nvme-tcp: reduce callback lock contention
On 16/07/2024 10:36, Hannes Reinecke wrote:
> From: Hannes Reinecke <hare@...e.de>
>
> We have heavily queued tx and rx flows, so callbacks might fire
> concurrently. As the callbacks influence the state machine, we
> should remove contention here so as not to impact I/O performance.
>
> Signed-off-by: Hannes Reinecke <hare@...nel.org>
> ---
> drivers/nvme/host/tcp.c | 14 ++++++++------
> 1 file changed, 8 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index a758fbb3f9bb..9634c16d7bc0 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -1153,28 +1153,28 @@ static void nvme_tcp_data_ready(struct sock *sk)
>
> trace_sk_data_ready(sk);
>
> - read_lock_bh(&sk->sk_callback_lock);
> - queue = sk->sk_user_data;
> + rcu_read_lock();
> + queue = rcu_dereference_sk_user_data(sk);
> if (likely(queue && queue->rd_enabled) &&
> !test_bit(NVME_TCP_Q_POLLING, &queue->flags)) {
> queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
> queue->data_ready_cnt++;
> }
> - read_unlock_bh(&sk->sk_callback_lock);
> + rcu_read_unlock();
Umm, this looks dangerous...
Please give a concrete (numeric) justification for this change, and
preferably a big fat comment on why it is safe to do (for either
.data_ready or .write_space).
Is there any precedent of another TCP ULP doing this? I'd like to have
the netdev folks review this change. CC'ing netdev.
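
To spell out what I mean: with this change the callback becomes a plain
RCU reader, so all of the safety now hangs on the teardown path. A rough
sketch of the reader side as I read the patch (illustration only, reusing
the driver's names, not the actual patch code):

	#include <net/sock.h>	/* rcu_dereference_sk_user_data() */

	/* reader side: runs from the socket callback, may race with
	 * queue teardown */
	static void example_data_ready(struct sock *sk)
	{
		struct nvme_tcp_queue *queue;

		rcu_read_lock();
		/* only valid if teardown unpublishes the pointer and
		 * then waits a grace period before freeing the queue */
		queue = rcu_dereference_sk_user_data(sk);
		if (queue && queue->rd_enabled)
			queue_work_on(queue->io_cpu, nvme_tcp_wq,
				      &queue->io_work);
		rcu_read_unlock();
	}
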
> }
>
> static void nvme_tcp_write_space(struct sock *sk)
> {
> struct nvme_tcp_queue *queue;
>
> - read_lock_bh(&sk->sk_callback_lock);
> - queue = sk->sk_user_data;
> + rcu_read_lock();
> + queue = rcu_dereference_sk_user_data(sk);
> if (likely(queue && sk_stream_is_writeable(sk))) {
> clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
> queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
> queue->write_space_cnt++;
> }
> - read_unlock_bh(&sk->sk_callback_lock);
> + rcu_read_unlock();
> }
>
> static void nvme_tcp_state_change(struct sock *sk)
> @@ -2076,6 +2076,7 @@ static void nvme_tcp_restore_sock_ops(struct nvme_tcp_queue *queue)
> sock->sk->sk_state_change = queue->state_change;
> sock->sk->sk_write_space = queue->write_space;
> write_unlock_bh(&sock->sk->sk_callback_lock);
> + synchronize_rcu();
> }
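
The synchronize_rcu() here only buys anything if sk_user_data is
unpublished under the callback lock first (which I assume happens
earlier in nvme_tcp_restore_sock_ops, outside this hunk). The ordering
that would make this safe, roughly (a sketch, not the patch code):

	/* teardown side: unpublish, then wait, then it is safe to free */
	static void example_restore(struct sock *sk,
				    struct nvme_tcp_queue *queue)
	{
		write_lock_bh(&sk->sk_callback_lock);
		rcu_assign_sk_user_data(sk, NULL);	/* callbacks now see NULL */
		sk->sk_data_ready = queue->data_ready;	/* restore originals */
		sk->sk_write_space = queue->write_space;
		write_unlock_bh(&sk->sk_callback_lock);

		synchronize_rcu();	/* drain callbacks still inside rcu_read_lock() */
		/* only now may the queue memory be freed or reused */
	}
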
>
> static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
> @@ -2115,6 +2116,7 @@ static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue)
> queue->sock->sk->sk_ll_usec = 1;
> #endif
> write_unlock_bh(&queue->sock->sk->sk_callback_lock);
> + synchronize_rcu();
> }
>
> static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)