linux-kernel - Re: [PATCH vhost 1/7] vdpa/mlx5: Create direct MKEYs in parallel

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <9f9a8d39-61ca-45df-a172-02831ed536fa@nvidia.com>
Date: Fri, 30 Aug 2024 13:09:46 +0200
From: Dragos Tatulea <dtatulea@...dia.com>
To: Eugenio Perez Martin <eperezma@...hat.com>
Cc: Jason Wang <jasowang@...hat.com>, Si-Wei Liu <si-wei.liu@...cle.com>,
 virtualization <virtualization@...ts.linux-foundation.org>,
 Gal Pressman <gal@...dia.com>, kvm list <kvm@...r.kernel.org>,
 linux-kernel <linux-kernel@...r.kernel.org>, Parav Pandit
 <parav@...dia.com>, Xuan Zhuo <xuanzhuo@...ux.alibaba.com>,
 Cosmin Ratiu <cratiu@...dia.com>, Michael Tsirkin <mst@...hat.com>
Subject: Re: [PATCH vhost 1/7] vdpa/mlx5: Create direct MKEYs in parallel



On 29.08.24 17:15, Eugenio Perez Martin wrote:
> On Thu, Aug 29, 2024 at 3:54 PM Dragos Tatulea <dtatulea@...dia.com> wrote:
>>
>>
>>
>> On 29.08.24 15:10, Eugenio Perez Martin wrote:
>>> On Wed, Aug 21, 2024 at 1:41 PM Dragos Tatulea <dtatulea@...dia.com> wrote:
>>>>
>>>> Use the async interface to issue MTT MKEY creation.
>>>> Extra care is taken at the allocation of FW input commands
>>>> due to the MTT tables having variable sizes depending on
>>>> MR.
>>>>
>>>> The indirect MKEY is still created synchronously at the
>>>> end as the direct MKEYs need to be filled in.
>>>>
>>>> This makes create_user_mr() 3-5x faster, depending on
>>>> the size of the MR.
>>>>
>>>> Signed-off-by: Dragos Tatulea <dtatulea@...dia.com>
>>>> Reviewed-by: Cosmin Ratiu <cratiu@...dia.com>
>>>> ---
>>>>  drivers/vdpa/mlx5/core/mr.c | 118 +++++++++++++++++++++++++++++-------
>>>>  1 file changed, 96 insertions(+), 22 deletions(-)
>>>>
>>>> diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
>>>> index 4758914ccf86..66e6a15f823f 100644
>>>> --- a/drivers/vdpa/mlx5/core/mr.c
>>>> +++ b/drivers/vdpa/mlx5/core/mr.c
>>>> @@ -49,17 +49,18 @@ static void populate_mtts(struct mlx5_vdpa_direct_mr *mr, __be64 *mtt)
>>>>         }
>>>>  }
>>>>
>>>> -static int create_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr)
>>>> +struct mlx5_create_mkey_mem {
>>>> +       u8 out[MLX5_ST_SZ_BYTES(create_mkey_out)];
>>>> +       u8 in[MLX5_ST_SZ_BYTES(create_mkey_in)];
>>>> +       DECLARE_FLEX_ARRAY(__be64, mtt);
>>>
>>> I may be missing something obvious, but why do we need
>>> DECLARE_FLEX_ARRAY here? My understanding is that it is only needed in
>>> special cases like uapi headers and we can use "__be64 mtt[]" here.
>>>
>> checkpatch.pl was complaining about it because in my initial version I
>> used the "[0]" version of zero length based arrays.
>>
>> My impression was that DECLARE_FLEX_ARRAY is preferred option because it
>> triggers a compiler error if the zero lenth array is not at the end of
>> the struct. But on closer inspection I see that using the right C99
>> empty brackets notation is enough to trigger this error.
>> DECLARE_FLEX_ARRAY seems to be useful for the union case.
>>
>> I will change it in a v2.
>>
>>>> +};
>>>> +
>>>> +static void fill_create_direct_mr(struct mlx5_vdpa_dev *mvdev,
>>>> +                                 struct mlx5_vdpa_direct_mr *mr,
>>>> +                                 struct mlx5_create_mkey_mem *mem)
>>>>  {
>>>> -       int inlen;
>>>> +       void *in = &mem->in;
>>>>         void *mkc;
>>>> -       void *in;
>>>> -       int err;
>>>> -
>>>> -       inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + roundup(MLX5_ST_SZ_BYTES(mtt) * mr->nsg, 16);
>>>> -       in = kvzalloc(inlen, GFP_KERNEL);
>>>> -       if (!in)
>>>> -               return -ENOMEM;
>>>>
>>>>         MLX5_SET(create_mkey_in, in, uid, mvdev->res.uid);
>>>>         mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
>>>> @@ -76,18 +77,25 @@ static int create_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct
>>>>         MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
>>>>                  get_octo_len(mr->end - mr->start, mr->log_size));
>>>>         populate_mtts(mr, MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt));
>>>> -       err = mlx5_vdpa_create_mkey(mvdev, &mr->mr, in, inlen);
>>>> -       kvfree(in);
>>>> -       if (err) {
>>>> -               mlx5_vdpa_warn(mvdev, "Failed to create direct MR\n");
>>>> -               return err;
>>>> -       }
>>>>
>>>> -       return 0;
>>>> +       MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
>>>> +       MLX5_SET(create_mkey_in, in, uid, mvdev->res.uid);
>>>> +}
>>>> +
>>>> +static void create_direct_mr_end(struct mlx5_vdpa_dev *mvdev,
>>>> +                                struct mlx5_vdpa_direct_mr *mr,
>>>> +                                struct mlx5_create_mkey_mem *mem)
>>>> +{
>>>> +       u32 mkey_index = MLX5_GET(create_mkey_out, mem->out, mkey_index);
>>>> +
>>>> +       mr->mr = mlx5_idx_to_mkey(mkey_index);
>>>>  }
>>>>
>>>>  static void destroy_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr)
>>>>  {
>>>> +       if (!mr->mr)
>>>> +               return;
>>>> +
>>>>         mlx5_vdpa_destroy_mkey(mvdev, mr->mr);
>>>>  }
>>>>
>>>> @@ -179,6 +187,74 @@ static int klm_byte_size(int nklms)
>>>>         return 16 * ALIGN(nklms, 4);
>>>>  }
>>>>
>>>> +static int create_direct_keys(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr)
>>>> +{
>>>> +       struct mlx5_vdpa_async_cmd *cmds = NULL;
>>>> +       struct mlx5_vdpa_direct_mr *dmr;
>>>> +       int err = 0;
>>>> +       int i = 0;
>>>> +
>>>> +       cmds = kvcalloc(mr->num_directs, sizeof(*cmds), GFP_KERNEL);
>>>> +       if (!cmds)
>>>> +               return -ENOMEM;
>>>
>>> Nit: this could benefit from Scope-based Cleanup Helpers [1], as it
>>> would make clear that memory is always removed at the end of the
>>> function & could prevent errors if the function is modified in the
>>> future. I'm a big fan of them so my opinion may be biased though :).
>>>
>>> It would be great to be able to remove the array members with that
>>> too, but I think the kernel doesn't have any facility for that.
>>>
>> I didn't know about those. It sounds like a good idea! I will try
>> to use them in v2.
>>
I tried for this patch: doing the DECLARE_FREE only for the cmds
array is only more confusing because the goto done still has to
happen due to cmd_mem elements being of different size. I preferred
not to mix and match.

I also tried adding a ctor/dtor with DECLARE_CLASS that wraps both
of these resources: the problem there is that the ctor can't fail
so the allocation still has to happen in this function but
we stilll need an extra struct to hold the cmds + size. Overall the
code was introducing more confusion.

However, it works very well for the next patch so I used it there
in v2.

In the future I will look at introducing more of these, they are
interesting helpers.

Thanks,
Dragos
>>>> +
>>>> +       list_for_each_entry(dmr, &mr->head, list) {
>>>> +               struct mlx5_create_mkey_mem *cmd_mem;
>>>> +               int mttlen, mttcount;
>>>> +
>>>> +               mttlen = roundup(MLX5_ST_SZ_BYTES(mtt) * dmr->nsg, 16);
>>>
>>> I don't get the roundup here, I guess it is so the driver does not
>>> send potentially uninitialized memory to the device? Maybe the 16
>>> should be a macro?
>>>
>> The roundup is a hw requirement. A define would be a good idea. Will add
>> it.
>>
>>>> +               mttcount = mttlen / sizeof(cmd_mem->mtt[0]);
>>>> +               cmd_mem = kvcalloc(1, struct_size(cmd_mem, mtt, mttcount), GFP_KERNEL);
>>>> +               if (!cmd_mem) {
>>>> +                       err = -ENOMEM;
>>>> +                       goto done;
>>>> +               }
>>>> +
>>>> +               cmds[i].out = cmd_mem->out;
>>>> +               cmds[i].outlen = sizeof(cmd_mem->out);
>>>> +               cmds[i].in = cmd_mem->in;
>>>> +               cmds[i].inlen = struct_size(cmd_mem, mtt, mttcount);
>>>> +
>>>> +               fill_create_direct_mr(mvdev, dmr, cmd_mem);
>>>> +
>>>> +               i++;
>>>> +       }
>>>> +
>>>> +       err = mlx5_vdpa_exec_async_cmds(mvdev, cmds, mr->num_directs);
>>>> +       if (err) {
>>>> +
>>>> +               mlx5_vdpa_err(mvdev, "error issuing MTT mkey creation for direct mrs: %d\n", err);
>>>> +               goto done;
>>>> +       }
>>>> +
>>>> +       i = 0;
>>>> +       list_for_each_entry(dmr, &mr->head, list) {
>>>> +               struct mlx5_vdpa_async_cmd *cmd = &cmds[i++];
>>>> +               struct mlx5_create_mkey_mem *cmd_mem;
>>>> +
>>>> +               cmd_mem = container_of(cmd->out, struct mlx5_create_mkey_mem, out);
>>>> +
>>>> +               if (!cmd->err) {
>>>> +                       create_direct_mr_end(mvdev, dmr, cmd_mem);
>>>
>>> The caller function doesn't trust the result if we return an error.
>>> Why not use the previous loop to call create_direct_mr_end? Am I
>>> missing any side effect?
>>>
>> Which previous loop? We have the mkey value only after the command has
>> been executed.
> 
> Ok, now I see what I proposed didn't make sense, thanks!
> 
>> I added the if here (instead of always calling
>> create_direct_mr_end()) just to make things more explicit for the
>> reader.
>>
>>>> +               } else {
>>>> +                       err = err ? err : cmd->err;
>>>> +                       mlx5_vdpa_err(mvdev, "error creating MTT mkey [0x%llx, 0x%llx]: %d\n",
>>>> +                               dmr->start, dmr->end, cmd->err);
>>>> +               }
>>>> +       }
>>>> +
>>>> +done:
>>>> +       for (i = i-1; i >= 0; i--) {
>>>> +               struct mlx5_create_mkey_mem *cmd_mem;
>>>> +
>>>> +               cmd_mem = container_of(cmds[i].out, struct mlx5_create_mkey_mem, out);
>>>> +               kvfree(cmd_mem);
>>>> +       }
>>>> +
>>>> +       kvfree(cmds);
>>>> +       return err;
>>>> +}
>>>> +
>>>>  static int create_indirect_key(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr)
>>>>  {
>>>>         int inlen;
>>>> @@ -279,14 +355,8 @@ static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr
>>>>                 goto err_map;
>>>>         }
>>>>
>>>> -       err = create_direct_mr(mvdev, mr);
>>>> -       if (err)
>>>> -               goto err_direct;
>>>> -
>>>>         return 0;
>>>>
>>>> -err_direct:
>>>> -       dma_unmap_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, DMA_BIDIRECTIONAL, 0);
>>>>  err_map:
>>>>         sg_free_table(&mr->sg_head);
>>>>         return err;
>>>> @@ -401,6 +471,10 @@ static int create_user_mr(struct mlx5_vdpa_dev *mvdev,
>>>>         if (err)
>>>>                 goto err_chain;
>>>>
>>>> +       err = create_direct_keys(mvdev, mr);
>>>> +       if (err)
>>>> +               goto err_chain;
>>>> +
>>>>         /* Create the memory key that defines the guests's address space. This
>>>>          * memory key refers to the direct keys that contain the MTT
>>>>          * translations
>>>> --
>>>> 2.45.1
>>>>
>>>
>>
>