[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <9bd285d7-78cd-ff13-4e89-eb36ad478780@grimberg.me>
Date: Mon, 14 Aug 2023 10:32:25 +0300
From: Sagi Grimberg <sagi@...mberg.me>
To: Hannes Reinecke <hare@...e.de>, Christoph Hellwig <hch@....de>
Cc: Keith Busch <kbusch@...nel.org>, linux-nvme@...ts.infradead.org,
Jakub Kicinski <kuba@...nel.org>, Eric Dumazet <edumazet@...gle.com>,
Paolo Abeni <pabeni@...hat.com>, netdev@...r.kernel.org
Subject: Re: [PATCH 14/17] nvmet-tcp: reference counting for queues
On 8/13/23 17:38, Hannes Reinecke wrote:
> On 8/13/23 16:01, Sagi Grimberg wrote:
>>
>>
>> On 8/11/23 15:17, Hannes Reinecke wrote:
>>> The 'queue' structure is referenced from various places and
>>> used as an argument of asynchronous functions, so it's really
>>> hard to figure out if the queue is still valid when the
>>> asynchronous function triggers.
>>> So add reference counting to validate the queue structure.
>>>
>>> Signed-off-by: Hannes Reinecke <hare@...e.de>
>>> ---
>>> drivers/nvme/target/tcp.c | 74 ++++++++++++++++++++++++++++++---------
>>> 1 file changed, 57 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
>>> index f19ea9d923fd..84b726dfc1c4 100644
>>> --- a/drivers/nvme/target/tcp.c
>>> +++ b/drivers/nvme/target/tcp.c
>>> @@ -127,6 +127,7 @@ enum nvmet_tcp_queue_state {
>>> };
>>> struct nvmet_tcp_queue {
>>> + struct kref kref;
>>> struct socket *sock;
>>> struct nvmet_tcp_port *port;
>>> struct work_struct io_work;
>>> @@ -192,6 +193,9 @@ static struct workqueue_struct *nvmet_tcp_wq;
>>> static const struct nvmet_fabrics_ops nvmet_tcp_ops;
>>> static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
>>> static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd);
>>> +static int nvmet_tcp_get_queue(struct nvmet_tcp_queue *queue);
>>> +static void nvmet_tcp_put_queue(struct nvmet_tcp_queue *queue);
>>> +static void nvmet_tcp_data_ready(struct sock *sk);
>>> static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
>>> struct nvmet_tcp_cmd *cmd)
>>> @@ -1437,11 +1441,21 @@ static void
>>> nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
>>> struct socket *sock = queue->sock;
>>> write_lock_bh(&sock->sk->sk_callback_lock);
>>> + /*
>>> + * Check if nvmet_tcp_set_queue_sock() has been called;
>>> + * if not the queue reference has not been increased
>>> + * and we're getting a refcount error on exit.
>>> + */
>>> + if (sock->sk->sk_data_ready != nvmet_tcp_data_ready) {
>>> + write_unlock_bh(&sock->sk->sk_callback_lock);
>>> + return;
>>> + }
>>
>> This is really ugly I think.
>>
> Me too, but what would be the alternative?
>
>>> sock->sk->sk_data_ready = queue->data_ready;
>>> sock->sk->sk_state_change = queue->state_change;
>>> sock->sk->sk_write_space = queue->write_space;
>>> sock->sk->sk_user_data = NULL;
>>> write_unlock_bh(&sock->sk->sk_callback_lock);
>>> + nvmet_tcp_put_queue(queue);
>>> }
>>> static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue
>>> *queue)
>>> @@ -1474,6 +1488,30 @@ static void
>>> nvmet_tcp_free_cmd_data_in_buffers(struct nvmet_tcp_queue *queue)
>>> nvmet_tcp_free_cmd_buffers(&queue->connect);
>>> }
>>> +static void nvmet_tcp_release_queue_final(struct kref *kref)
>>> +{
>>> + struct nvmet_tcp_queue *queue = container_of(kref, struct
>>> nvmet_tcp_queue, kref);
>>> +
>>> + WARN_ON(queue->state != NVMET_TCP_Q_DISCONNECTING);
>>> + nvmet_tcp_free_cmds(queue);
>>> + ida_free(&nvmet_tcp_queue_ida, queue->idx);
>>> + /* ->sock will be released by fput() */
>>> + fput(queue->sock->file);
>>> + kfree(queue);
>>> +}
>>> +
>>> +static int nvmet_tcp_get_queue(struct nvmet_tcp_queue *queue)
>>> +{
>>> + if (!queue)
>>> + return 0;
>>> + return kref_get_unless_zero(&queue->kref);
>>> +}
>>> +
>>> +static void nvmet_tcp_put_queue(struct nvmet_tcp_queue *queue)
>>> +{
>>> + kref_put(&queue->kref, nvmet_tcp_release_queue_final);
>>> +}
>>> +
>>> static void nvmet_tcp_release_queue_work(struct work_struct *w)
>>> {
>>> struct page *page;
>>> @@ -1493,15 +1531,11 @@ static void
>>> nvmet_tcp_release_queue_work(struct work_struct *w)
>>> nvmet_sq_destroy(&queue->nvme_sq);
>>> cancel_work_sync(&queue->io_work);
>>> nvmet_tcp_free_cmd_data_in_buffers(queue);
>>> - /* ->sock will be released by fput() */
>>> - fput(queue->sock->file);
>>> - nvmet_tcp_free_cmds(queue);
>>> if (queue->hdr_digest || queue->data_digest)
>>> nvmet_tcp_free_crypto(queue);
>>> - ida_free(&nvmet_tcp_queue_ida, queue->idx);
>>> page = virt_to_head_page(queue->pf_cache.va);
>>> __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
>>> - kfree(queue);
>>> + nvmet_tcp_put_queue(queue);
>>
>> What made you pick these vs. the others for before/after the
>> final reference?
>>
> I wanted to call 'nvmet_tcp_put_queue()' for a failed allocation
> in nvmet_tcp_alloc_queue(), and at that time the queue itself
> is not live.
> nvmet_tcp_release_queue() is only called on a live queue, so using
> that as the kref ->release() function would either limit its
> usefulness or would require me to ensure that all calls in there
> can be made with a non-initialized argument.
>
>>> }
>>> static void nvmet_tcp_data_ready(struct sock *sk)
>>> @@ -1512,8 +1546,10 @@ static void nvmet_tcp_data_ready(struct sock *sk)
>>> read_lock_bh(&sk->sk_callback_lock);
>>> queue = sk->sk_user_data;
>>> - if (likely(queue))
>>> + if (likely(nvmet_tcp_get_queue(queue))) {
>>> queue_work_on(queue_cpu(queue), nvmet_tcp_wq,
>>> &queue->io_work);
>>> + nvmet_tcp_put_queue(queue);
>>> + }
>>
>> No... Why?
>>
>> The shutdown code should serialize perfectly without this. Why add
>> a kref to the normal I/O path?
>>
>> I thought we'd simply move release_work to do a kref_put and take
>> an extra reference when we fire the tls handshake...
>>
> Because I feel ever so slightly unsure about using the sk_user_data
> argument. This function is completely asynchronous, and I can't really
> see how we can ensure that sk_user_data references a valid object.
> (If it were valid, why would we need to check for !queue ?)
>
> If you have lifetime guarantees that the kref isn't required, by all
> means, please tell me, and we can drop the kref thing here.
> But I guess that would imply to _not_ having to check for (!queue)
> which is fine by me, too.
Something doesn't add up here.
What I think you need to do is add another reference just for the
tls handshake.
Then in the timeout handler you call tls_handshake_cancel():
- if you got %true back, you drop the reference and schedule
a release work
- if you got %false back, you simply ignore the timeout because
the .ta_done() was already triggered.
- in .ta_done() you drop the reference, cancel the timeout work
and then continue or remove based on the status.
btw in the queue release you should call tls_handshake_cancel() as well.
Powered by blists - more mailing lists