netdev - Re: [PATCH 14/17] nvmet-tcp: reference counting for queues

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Date: Mon, 14 Aug 2023 10:32:25 +0300
From: Sagi Grimberg <sagi@...mberg.me>
To: Hannes Reinecke <hare@...e.de>, Christoph Hellwig <hch@....de>
Cc: Keith Busch <kbusch@...nel.org>, linux-nvme@...ts.infradead.org,
 Jakub Kicinski <kuba@...nel.org>, Eric Dumazet <edumazet@...gle.com>,
 Paolo Abeni <pabeni@...hat.com>, netdev@...r.kernel.org
Subject: Re: [PATCH 14/17] nvmet-tcp: reference counting for queues



On 8/13/23 17:38, Hannes Reinecke wrote:
> On 8/13/23 16:01, Sagi Grimberg wrote:
>>
>>
>> On 8/11/23 15:17, Hannes Reinecke wrote:
>>> The 'queue' structure is referenced from various places and
>>> used as an argument of asynchronous functions, so it's really
>>> hard to figure out if the queue is still valid when the
>>> asynchronous function triggers.
>>> So add reference counting to validate the queue structure.
>>>
>>> Signed-off-by: Hannes Reinecke <hare@...e.de>
>>> ---
>>>   drivers/nvme/target/tcp.c | 74 ++++++++++++++++++++++++++++++---------
>>>   1 file changed, 57 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
>>> index f19ea9d923fd..84b726dfc1c4 100644
>>> --- a/drivers/nvme/target/tcp.c
>>> +++ b/drivers/nvme/target/tcp.c
>>> @@ -127,6 +127,7 @@ enum nvmet_tcp_queue_state {
>>>   };
>>>   struct nvmet_tcp_queue {
>>> +    struct kref        kref;
>>>       struct socket        *sock;
>>>       struct nvmet_tcp_port    *port;
>>>       struct work_struct    io_work;
>>> @@ -192,6 +193,9 @@ static struct workqueue_struct *nvmet_tcp_wq;
>>>   static const struct nvmet_fabrics_ops nvmet_tcp_ops;
>>>   static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
>>>   static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd);
>>> +static int nvmet_tcp_get_queue(struct nvmet_tcp_queue *queue);
>>> +static void nvmet_tcp_put_queue(struct nvmet_tcp_queue *queue);
>>> +static void nvmet_tcp_data_ready(struct sock *sk);
>>>   static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
>>>           struct nvmet_tcp_cmd *cmd)
>>> @@ -1437,11 +1441,21 @@ static void 
>>> nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
>>>       struct socket *sock = queue->sock;
>>>       write_lock_bh(&sock->sk->sk_callback_lock);
>>> +    /*
>>> +     * Check if nvmet_tcp_set_queue_sock() has been called;
>>> +     * if not the queue reference has not been increased
>>> +     * and we're getting an refcount error on exit.
>>> +     */
>>> +    if (sock->sk->sk_data_ready != nvmet_tcp_data_ready) {
>>> +        write_unlock_bh(&sock->sk->sk_callback_lock);
>>> +        return;
>>> +    }
>>
>> This is really ugly I think.
>>
> Me too, but what would be the alternative?
> 
>>>       sock->sk->sk_data_ready =  queue->data_ready;
>>>       sock->sk->sk_state_change = queue->state_change;
>>>       sock->sk->sk_write_space = queue->write_space;
>>>       sock->sk->sk_user_data = NULL;
>>>       write_unlock_bh(&sock->sk->sk_callback_lock);
>>> +    nvmet_tcp_put_queue(queue);
>>>   }
>>>   static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue 
>>> *queue)
>>> @@ -1474,6 +1488,30 @@ static void 
>>> nvmet_tcp_free_cmd_data_in_buffers(struct nvmet_tcp_queue *queue)
>>>           nvmet_tcp_free_cmd_buffers(&queue->connect);
>>>   }
>>> +static void nvmet_tcp_release_queue_final(struct kref *kref)
>>> +{
>>> +    struct nvmet_tcp_queue *queue = container_of(kref, struct 
>>> nvmet_tcp_queue, kref);
>>> +
>>> +    WARN_ON(queue->state != NVMET_TCP_Q_DISCONNECTING);
>>> +    nvmet_tcp_free_cmds(queue);
>>> +    ida_free(&nvmet_tcp_queue_ida, queue->idx);
>>> +    /* ->sock will be released by fput() */
>>> +    fput(queue->sock->file);
>>> +    kfree(queue);
>>> +}
>>> +
>>> +static int nvmet_tcp_get_queue(struct nvmet_tcp_queue *queue)
>>> +{
>>> +    if (!queue)
>>> +        return 0;
>>> +    return kref_get_unless_zero(&queue->kref);
>>> +}
>>> +
>>> +static void nvmet_tcp_put_queue(struct nvmet_tcp_queue *queue)
>>> +{
>>> +    kref_put(&queue->kref, nvmet_tcp_release_queue_final);
>>> +}
>>> +
>>>   static void nvmet_tcp_release_queue_work(struct work_struct *w)
>>>   {
>>>       struct page *page;
>>> @@ -1493,15 +1531,11 @@ static void 
>>> nvmet_tcp_release_queue_work(struct work_struct *w)
>>>       nvmet_sq_destroy(&queue->nvme_sq);
>>>       cancel_work_sync(&queue->io_work);
>>>       nvmet_tcp_free_cmd_data_in_buffers(queue);
>>> -    /* ->sock will be released by fput() */
>>> -    fput(queue->sock->file);
>>> -    nvmet_tcp_free_cmds(queue);
>>>       if (queue->hdr_digest || queue->data_digest)
>>>           nvmet_tcp_free_crypto(queue);
>>> -    ida_free(&nvmet_tcp_queue_ida, queue->idx);
>>>       page = virt_to_head_page(queue->pf_cache.va);
>>>       __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
>>> -    kfree(queue);
>>> +    nvmet_tcp_put_queue(queue);
>>
>> What made you pick these vs. the others for before/after the
>> final reference?
>>
> I wanted to call 'nvmet_tcp_put_queue()' for a failed allocation
> in nvmet_tcp_alloc_queue(), and at that time the queue itself
> is not live.
> nvmet_tcp_release_queue() is only called on a live queue, so using
> that as the kref ->release() function would either limit its
> usefulness or would require me to ensure that all calls in there
> can be made with a non-initialized argument.
> 
>>>   }
>>>   static void nvmet_tcp_data_ready(struct sock *sk)
>>> @@ -1512,8 +1546,10 @@ static void nvmet_tcp_data_ready(struct sock *sk)
>>>       read_lock_bh(&sk->sk_callback_lock);
>>>       queue = sk->sk_user_data;
>>> -    if (likely(queue))
>>> +    if (likely(nvmet_tcp_get_queue(queue))) {
>>>           queue_work_on(queue_cpu(queue), nvmet_tcp_wq, 
>>> &queue->io_work);
>>> +        nvmet_tcp_put_queue(queue);
>>> +    }
>>
>> No... Why?
>>
>> The shutdown code should serialize perfectly without this. Why add
>> a kref to the normal I/O path?
>>
>> I thought we'd simply move release_work to do a kref_put and take
>> an extra reference when we fire the tls handshake...
>>
> Because I feel ever so slightly unsure about using the sk_user_data
> argument. This function is completely asynchronous, and I can't really
> see how we can ensure that sk_user_data references a valid object.
> (If it were valid, why would we need to check for !queue ?)
> 
> If you have lifetime guarantees that make the kref unnecessary, by all
> means, please tell me, and we can drop the kref thing here.
> But I guess that would imply _not_ having to check for (!queue),
> which is fine by me, too.

Something doesn't add up here.
What I think you need to do is add another reference just for the
TLS handshake.

Then in the timeout handler you call tls_handshake_cancel():
- if you got %true back, you drop the reference and schedule
a release work
- if you got %false back, you simply ignore the timeout because
the .ta_done() was already triggered.
- in .ta_done() you drop the reference, cancel the timeout work
   and then continue or remove based on the status.

Btw, in the queue release you should call tls_handshake_cancel() as well.