[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAPhsuW4P5J1ZgUVKGfwWs7vX_7thc5cr1x6mrY8Wx3d2dqEDTw@mail.gmail.com>
Date: Mon, 25 Jun 2018 23:21:10 -0700
From: Song Liu <liu.song.a23@...il.com>
To: Jakub Kicinski <jakub.kicinski@...ronome.com>
Cc: Alexei Starovoitov <alexei.starovoitov@...il.com>,
Daniel Borkmann <daniel@...earbox.net>,
oss-drivers@...ronome.com, Networking <netdev@...r.kernel.org>,
Jiong Wang <jiong.wang@...ronome.com>
Subject: Re: [PATCH bpf-next 2/7] lib: reciprocal_div: implement the improved
algorithm on the paper mentioned
On Sun, Jun 24, 2018 at 8:54 PM, Jakub Kicinski
<jakub.kicinski@...ronome.com> wrote:
> From: Jiong Wang <jiong.wang@...ronome.com>
>
> The new added "reciprocal_value_adv" implements the advanced version of the
> algorithm described in Figure 4.2 of the paper except when dividend has MSB
> set which would require u128 divide on host and actually could be easily
> handled before calling the new "reciprocal_value_adv".
>
> The advanced version requires more complex calculation to get the
> reciprocal multiplier and other control variables, but then could reduce
> the required emulation operations.
>
> It makes no sense to use this advanced version for host divide emulation,
> those extra complexities for calculating multiplier etc could completely
> waive our saving on emulation operations.
>
> However, it makes sense to use it for JIT divide code generation (for
> example eBPF JIT backends) for which we are willing to trade performance of
> JITed code with that of host. As shown by the following pseudo code, the
> required emulation operations could go down from 6 (the basic version) to 3
> or 4.
>
> To use the result of "reciprocal_value_adv", suppose we want to calculate
> n/d, the C-style pseudo code will be the following, it could be easily
> changed to real code generation for other JIT targets.
>
> struct reciprocal_value_adv rvalue;
> u8 pre_shift, exp;
>
> if (d >= (1u << 31)) {
> result = n >= d;
> return;
> }
> rvalue = reciprocal_value_adv(d, 32);
> exp = rvalue.exp;
> if (rvalue.is_wide_m && !(d & 1)) {
> pre_shift = fls(d & -d) - 1;
> rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
> } else {
> pre_shift = 0;
> }
>
> // code generation starts.
> if (d == 1 << exp) {
> result = n >> exp;
> } else if (rvalue.is_wide_m) {
> // pre_shift must be zero when reached here.
> t = (n * rvalue.m) >> 32;
> result = n - t;
> result >>= 1;
> result += t;
> result >>= rvalue.sh - 1;
> } else {
> if (pre_shift)
> result = n >> pre_shift;
> result = ((u64)result * rvalue.m) >> 32;
> result >>= rvalue.sh;
> }
>
> Signed-off-by: Jiong Wang <jiong.wang@...ronome.com>
> Reviewed-by: Jakub Kicinski <jakub.kicinski@...ronome.com>
> ---
> include/linux/reciprocal_div.h | 65 ++++++++++++++++++++++++++++++++++
> lib/reciprocal_div.c | 37 +++++++++++++++++++
> 2 files changed, 102 insertions(+)
>
> diff --git a/include/linux/reciprocal_div.h b/include/linux/reciprocal_div.h
> index e031e9f2f9d8..5a695e4697d3 100644
> --- a/include/linux/reciprocal_div.h
> +++ b/include/linux/reciprocal_div.h
> @@ -25,6 +25,9 @@ struct reciprocal_value {
> u8 sh1, sh2;
> };
>
> +/* "reciprocal_value" and "reciprocal_divide" together implement the basic
> + * version of the algorithm described in Figure 4.1 of the paper.
> + */
> struct reciprocal_value reciprocal_value(u32 d);
>
> static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
> @@ -33,4 +36,66 @@ static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
> return (t + ((a - t) >> R.sh1)) >> R.sh2;
> }
>
> +struct reciprocal_value_adv {
> + u32 m;
> + u8 sh, exp;
> + bool is_wide_m;
> +};
> +
> +/* "reciprocal_value_adv" implements the advanced version of the algorithm
> + * described in Figure 4.2 of the paper except when dividend has MSB set which
> + * would require u128 divide on host and actually could be easily handled before
> + * calling "reciprocal_value_adv".
> + *
> + * The advanced version requires more complex calculation to get the reciprocal
> + * multiplier and other control variables, but then could reduce the required
> + * emulation operations.
> + *
> + * It makes no sense to use this advanced version for host divide emulation,
> + * those extra complexities for calculating multiplier etc could completely
> + * waive our saving on emulation operations.
> + *
> + * However, it makes sense to use it for JIT divide code generation for which
> + * we are willing to trade performance of JITed code with that of host. As shown
> + * by the following pseudo code, the required emulation operations could go down
> + * from 6 (the basic version) to 3 or 4.
> + *
> + * To use the result of "reciprocal_value_adv", suppose we want to calculate
> + * n/d:
> + *
> + * struct reciprocal_value_adv rvalue;
> + * u8 pre_shift, exp;
> + *
> + * if (d >= (1u << 31)) {
> + * result = n >= d;
> + * return;
> + * }
> + * rvalue = reciprocal_value_adv(d, 32);
> + * exp = rvalue.exp;
> + * if (rvalue.is_wide_m && !(d & 1)) {
> + * pre_shift = fls(d & -d) - 1;
> + * rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
> + * } else {
> + * pre_shift = 0;
> + * }
> + *
> + * // code generation starts.
> + * if (d == 1 << exp) {
> + * result = n >> exp;
> + * } else if (rvalue.is_wide_m) {
> + * // pre_shift must be zero when reached here.
> + * t = (n * rvalue.m) >> 32;
> + * result = n - t;
> + * result >>= 1;
> + * result += t;
> + * result >>= rvalue.sh - 1;
> + * } else {
> + * if (pre_shift)
> + * result = n >> pre_shift;
> + * result = ((u64)result * rvalue.m) >> 32;
> + * result >>= rvalue.sh;
> + * }
> + */
> +struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec);
> +
> #endif /* _LINUX_RECIPROCAL_DIV_H */
> diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c
> index fcb4ce682c6f..a41501ebad7c 100644
> --- a/lib/reciprocal_div.c
> +++ b/lib/reciprocal_div.c
> @@ -26,3 +26,40 @@ struct reciprocal_value reciprocal_value(u32 d)
> return R;
> }
> EXPORT_SYMBOL(reciprocal_value);
> +
> +struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec)
> +{
> + struct reciprocal_value_adv R;
> + u32 l, post_shift;
> + u64 mhigh, mlow;
> +
> + l = fls(d - 1);
> + post_shift = l;
> + /* NOTE: mlow/mhigh could overflow u64 when l == 32 which means d has
> + * MSB set. This case needs to be handled before calling
> + * "reciprocal_value_adv", please see the comment at
> + * include/linux/reciprocal_div.h.
> + */
Shall we handle the l == 32 case better? I guess the concern here is that the
extra handling may slow down the fast path. If that's the case, we should at
least add a WARNING on the slow path.
Thanks,
Song
> + mlow = 1ULL << (32 + l);
> + do_div(mlow, d);
> + mhigh = (1ULL << (32 + l)) + (1ULL << (32 + l - prec));
> + do_div(mhigh, d);
> +
> + for (; post_shift > 0; post_shift--) {
> + u64 lo = mlow >> 1, hi = mhigh >> 1;
> +
> + if (lo >= hi)
> + break;
> +
> + mlow = lo;
> + mhigh = hi;
> + }
> +
> + R.m = (u32)mhigh;
> + R.sh = post_shift;
> + R.exp = l;
> + R.is_wide_m = mhigh > U32_MAX;
> +
> + return R;
> +}
> +EXPORT_SYMBOL(reciprocal_value_adv);
> --
> 2.17.1
>
Powered by blists - more mailing lists