lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAOSviJ3MNDOYJzJFjQDCjc04pGsktQ5vjQvDotqYoRwC2Wf=HQ@mail.gmail.com>
Date: Thu, 17 Apr 2025 01:27:55 +0530
From: Nitesh Shetty <nitheshshetty@...il.com>
To: Jens Axboe <axboe@...nel.dk>
Cc: Pavel Begunkov <asml.silence@...il.com>, Nitesh Shetty <nj.shetty@...sung.com>, gost.dev@...sung.com, 
	io-uring@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH] io_uring/rsrc: send exact nr_segs for fixed buffer

On Wed, Apr 16, 2025 at 11:55 PM Jens Axboe <axboe@...nel.dk> wrote:
>
> On 4/16/25 9:07 AM, Jens Axboe wrote:
> > On 4/16/25 9:03 AM, Pavel Begunkov wrote:
> >> On 4/16/25 06:44, Nitesh Shetty wrote:
> >>> Sending exact nr_segs, avoids bio split check and processing in
> >>> block layer, which takes around 5%[1] of overall CPU utilization.
> >>>
> >>> In our setup, we see overall improvement of IOPS from 7.15M to 7.65M [2]
> >>> and 5% less CPU utilization.
> >>>
> >>> [1]
> >>>       3.52%  io_uring         [kernel.kallsyms]     [k] bio_split_rw_at
> >>>       1.42%  io_uring         [kernel.kallsyms]     [k] bio_split_rw
> >>>       0.62%  io_uring         [kernel.kallsyms]     [k] bio_submit_split
> >>>
> >>> [2]
> >>> sudo taskset -c 0,1 ./t/io_uring -b512 -d128 -c32 -s32 -p1 -F1 -B1 -n2
> >>> -r4 /dev/nvme0n1 /dev/nvme1n1
> >>>
> >>> Signed-off-by: Nitesh Shetty <nj.shetty@...sung.com>
> >>> ---
> >>>   io_uring/rsrc.c | 3 +++
> >>>   1 file changed, 3 insertions(+)
> >>>
> >>> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> >>> index b36c8825550e..6fd3a4a85a9c 100644
> >>> --- a/io_uring/rsrc.c
> >>> +++ b/io_uring/rsrc.c
> >>> @@ -1096,6 +1096,9 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
> >>>               iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
> >>>           }
> >>>       }
> >>> +    iter->nr_segs = (iter->bvec->bv_offset + iter->iov_offset +
> >>> +        iter->count + ((1UL << imu->folio_shift) - 1)) /
> >>> +        (1UL << imu->folio_shift);
> >>
> >> That's not going to work with ->is_kbuf as the segments are not uniform in
> >> size.
> >
> > Oops yes good point.
>
> How about something like this? Trims superfluous end segments, if they
> exist. The 'offset' section already trimmed the front parts. For
> !is_kbuf that should be simple math, like in Nitesh's patch. For
> is_kbuf, iterate them.
>
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index bef66e733a77..e482ea1e22a9 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -1036,6 +1036,7 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
>                            struct io_mapped_ubuf *imu,
>                            u64 buf_addr, size_t len)
>  {
> +       const struct bio_vec *bvec;
>         unsigned int folio_shift;
>         size_t offset;
>         int ret;
> @@ -1052,9 +1053,10 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
>          * Might not be a start of buffer, set size appropriately
>          * and advance us to the beginning.
>          */
> +       bvec = imu->bvec;
>         offset = buf_addr - imu->ubuf;
>         folio_shift = imu->folio_shift;
> -       iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
> +       iov_iter_bvec(iter, ddir, bvec, imu->nr_bvecs, offset + len);
>
>         if (offset) {
>                 /*
> @@ -1073,7 +1075,6 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
>                  * since we can just skip the first segment, which may not
>                  * be folio_size aligned.
>                  */
> -               const struct bio_vec *bvec = imu->bvec;
>
>                 /*
>                  * Kernel buffer bvecs, on the other hand, don't necessarily
> @@ -1099,6 +1100,27 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
>                 }
>         }
>
> +       /*
> +        * Offset trimmed front segments too, if any, now trim the tail.
> +        * For is_kbuf we'll iterate them as they may be different sizes,
> +        * otherwise we can just do straight up math.
> +        */
> +       if (len + offset < imu->len) {
> +               bvec = iter->bvec;
> +               if (imu->is_kbuf) {
> +                       while (len > bvec->bv_len) {
> +                               len -= bvec->bv_len;
> +                               bvec++;
> +                       }
> +                       iter->nr_segs = bvec - iter->bvec;
> +               } else {
> +                       size_t vec_len;
> +
> +                       vec_len = bvec->bv_offset + iter->iov_offset +
> +                                       iter->count + ((1UL << folio_shift) - 1);
> +                       iter->nr_segs = vec_len >> folio_shift;
> +               }
> +       }
>         return 0;
>  }
This might not be needed for is_kbuf, as it already updates nr_segs
inside iov_iter_advance.

How about changing it to something like this?

-               if (offset < bvec->bv_len) {
-                       iter->count -= offset;
-                       iter->iov_offset = offset;
-               } else if (imu->is_kbuf) {
+               if (!imu->is_kbuf) {
+                       size_t vec_len;
+
+                       if (offset < bvec->bv_len) {
+                               iter->count -= offset;
+                               iter->iov_offset = offset;
+                       } else {
+                               unsigned long seg_skip;
+
+                               /* skip first vec */
+                               offset -= bvec->bv_len;
+                               seg_skip = 1 + (offset >> folio_shift);
+
+                               iter->bvec += seg_skip;
+                               iter->count -= bvec->bv_len + offset;
+                       iter->iov_offset = offset & ((1UL << folio_shift) - 1);
+                       }
+                       vec_len = ALIGN(iter->bvec->bv_offset +
+                               iter->iov_offset + iter->count, folio_shift);
+                       iter->nr_segs = vec_len >> folio_shift;
+               } else
                        iov_iter_advance(iter, offset);
-               } else {
-                       unsigned long seg_skip;
-
-                       /* skip first vec */
-                       offset -= bvec->bv_len;
-                       seg_skip = 1 + (offset >> folio_shift);
-
-                       iter->bvec += seg_skip;
-                       iter->count -= bvec->bv_len + offset;
-                       iter->iov_offset = offset & ((1UL << folio_shift) - 1);
-               }
        }

Regards,
Nitesh

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ