[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAPhsuW4P5J1ZgUVKGfwWs7vX_7thc5cr1x6mrY8Wx3d2dqEDTw@mail.gmail.com>
Date: Mon, 25 Jun 2018 23:21:10 -0700
From: Song Liu <liu.song.a23@...il.com>
To: Jakub Kicinski <jakub.kicinski@...ronome.com>
Cc: Alexei Starovoitov <alexei.starovoitov@...il.com>,
Daniel Borkmann <daniel@...earbox.net>,
oss-drivers@...ronome.com, Networking <netdev@...r.kernel.org>,
Jiong Wang <jiong.wang@...ronome.com>
Subject: Re: [PATCH bpf-next 2/7] lib: reciprocal_div: implement the improved
algorithm on the paper mentioned
On Sun, Jun 24, 2018 at 8:54 PM, Jakub Kicinski
<jakub.kicinski@...ronome.com> wrote:
> From: Jiong Wang <jiong.wang@...ronome.com>
>
> The new added "reciprocal_value_adv" implements the advanced version of the
> algorithm described in Figure 4.2 of the paper except when dividend has MSB
> set which would require u128 divide on host and actually could be easily
> handled before calling the new "reciprocal_value_adv".
>
> The advanced version requires more complex calculation to get the
> reciprocal multiplier and other control variables, but then could reduce
> the required emulation operations.
>
> It makes no sense to use this advanced version for host divide emulation,
> those extra complexities for calculating multiplier etc could completely
> waive our saving on emulation operations.
>
> However, it makes sense to use it for JIT divide code generation (for
> example eBPF JIT backends) for which we are willing to trade performance of
> JITed code with that of host. As shown by the following pseudo code, the
> required emulation operations could go down from 6 (the basic version) to 3
> or 4.
>
> To use the result of "reciprocal_value_adv", suppose we want to calculate
> n/d, the C-style pseudo code will be the following, it could be easily
> changed to real code generation for other JIT targets.
>
> struct reciprocal_value_adv rvalue;
> u8 pre_shift, exp;
>
> if (d >= (1u << 31)) {
> result = n >= d;
> return;
> }
> rvalue = reciprocal_value_adv(d, 32);
> exp = rvalue.exp;
> if (rvalue.is_wide_m && !(d & 1)) {
> pre_shift = fls(d & -d) - 1;
> rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
> } else {
> pre_shift = 0;
> }
>
> // code generation starts.
> if (d == 1 << exp) {
> result = n >> exp;
> } else if (rvalue.is_wide_m) {
> // pre_shift must be zero when reached here.
> t = (n * rvalue.m) >> 32;
> result = n - t;
> result >>= 1;
> result += t;
> result >>= rvalue.sh - 1;
> } else {
> if (pre_shift)
> result = n >> pre_shift;
> result = ((u64)result * rvalue.m) >> 32;
> result >>= rvalue.sh;
> }
>
> Signed-off-by: Jiong Wang <jiong.wang@...ronome.com>
> Reviewed-by: Jakub Kicinski <jakub.kicinski@...ronome.com>
> ---
> include/linux/reciprocal_div.h | 65 ++++++++++++++++++++++++++++++++++
> lib/reciprocal_div.c | 37 +++++++++++++++++++
> 2 files changed, 102 insertions(+)
>
> diff --git a/include/linux/reciprocal_div.h b/include/linux/reciprocal_div.h
> index e031e9f2f9d8..5a695e4697d3 100644
> --- a/include/linux/reciprocal_div.h
> +++ b/include/linux/reciprocal_div.h
> @@ -25,6 +25,9 @@ struct reciprocal_value {
> u8 sh1, sh2;
> };
>
> +/* "reciprocal_value" and "reciprocal_divide" together implement the basic
> + * version of the algorithm described in Figure 4.1 of the paper.
> + */
> struct reciprocal_value reciprocal_value(u32 d);
>
> static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
> @@ -33,4 +36,66 @@ static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
> return (t + ((a - t) >> R.sh1)) >> R.sh2;
> }
>
> +struct reciprocal_value_adv {
> + u32 m;
> + u8 sh, exp;
> + bool is_wide_m;
> +};
> +
> +/* "reciprocal_value_adv" implements the advanced version of the algorithm
> + * described in Figure 4.2 of the paper except when dividend has MSB set which
> + * would require u128 divide on host and actually could be easily handled before
> + * calling "reciprocal_value_adv".
> + *
> + * The advanced version requires more complex calculation to get the reciprocal
> + * multiplier and other control variables, but then could reduce the required
> + * emulation operations.
> + *
> + * It makes no sense to use this advanced version for host divide emulation,
> + * those extra complexities for calculating multiplier etc could completely
> + * waive our saving on emulation operations.
> + *
> + * However, it makes sense to use it for JIT divide code generation for which
> + * we are willing to trade performance of JITed code with that of host. As shown
> + * by the following pseudo code, the required emulation operations could go down
> + * from 6 (the basic version) to 3 or 4.
> + *
> + * To use the result of "reciprocal_value_adv", suppose we want to calculate
> + * n/d:
> + *
> + * struct reciprocal_value_adv rvalue;
> + * u8 pre_shift, exp;
> + *
> + * if (d >= (1u << 31)) {
> + * result = n >= d;
> + * return;
> + * }
> + * rvalue = reciprocal_value_adv(d, 32);
> + * exp = rvalue.exp;
> + * if (rvalue.is_wide_m && !(d & 1)) {
> + * pre_shift = fls(d & -d) - 1;
> + * rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
> + * } else {
> + * pre_shift = 0;
> + * }
> + *
> + * // code generation starts.
> + * if (d == 1 << exp) {
> + * result = n >> exp;
> + * } else if (rvalue.is_wide_m) {
> + * // pre_shift must be zero when reached here.
> + * t = (n * rvalue.m) >> 32;
> + * result = n - t;
> + * result >>= 1;
> + * result += t;
> + * result >>= rvalue.sh - 1;
> + * } else {
> + * if (pre_shift)
> + * result = n >> pre_shift;
> + * result = ((u64)result * rvalue.m) >> 32;
> + * result >>= rvalue.sh;
> + * }
> + */
> +struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec);
> +
> #endif /* _LINUX_RECIPROCAL_DIV_H */
> diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c
> index fcb4ce682c6f..a41501ebad7c 100644
> --- a/lib/reciprocal_div.c
> +++ b/lib/reciprocal_div.c
> @@ -26,3 +26,40 @@ struct reciprocal_value reciprocal_value(u32 d)
> return R;
> }
> EXPORT_SYMBOL(reciprocal_value);
> +
> +struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec)
> +{
> + struct reciprocal_value_adv R;
> + u32 l, post_shift;
> + u64 mhigh, mlow;
> +
> + l = fls(d - 1);
> + post_shift = l;
> + /* NOTE: mlow/mhigh could overflow u64 when l == 32 which means d has
> + * MSB set. This case needs to be handled before calling
> + * "reciprocal_value_adv", please see the comment at
> + * include/linux/reciprocal_div.h.
> + */
Shall we handle the l == 32 case better? I guess the concern here is that the
extra handling may slow down the fast path. If that's the case, we should at
least add a WARNING on the slow path.
Thanks,
Song
> + mlow = 1ULL << (32 + l);
> + do_div(mlow, d);
> + mhigh = (1ULL << (32 + l)) + (1ULL << (32 + l - prec));
> + do_div(mhigh, d);
> +
> + for (; post_shift > 0; post_shift--) {
> + u64 lo = mlow >> 1, hi = mhigh >> 1;
> +
> + if (lo >= hi)
> + break;
> +
> + mlow = lo;
> + mhigh = hi;
> + }
> +
> + R.m = (u32)mhigh;
> + R.sh = post_shift;
> + R.exp = l;
> + R.is_wide_m = mhigh > U32_MAX;
> +
> + return R;
> +}
> +EXPORT_SYMBOL(reciprocal_value_adv);
> --
> 2.17.1
>
Powered by blists - more mailing lists