[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <a16ccf51-4b06-4c6d-94a1-cb43dc3f2945@rivosinc.com>
Date: Fri, 7 Jun 2024 15:53:23 -0400
From: Jesse Taube <jesse@...osinc.com>
To: Charlie Jenkins <charlie@...osinc.com>
Cc: linux-riscv@...ts.infradead.org, Paul Walmsley
<paul.walmsley@...ive.com>, Palmer Dabbelt <palmer@...belt.com>,
Albert Ou <aou@...s.berkeley.edu>, Conor Dooley
<conor.dooley@...rochip.com>, Evan Green <evan@...osinc.com>,
Andrew Jones <ajones@...tanamicro.com>, Xiao Wang <xiao.w.wang@...el.com>,
Clément Léger <cleger@...osinc.com>,
Andy Chiu <andy.chiu@...ive.com>, Greentime Hu <greentime.hu@...ive.com>,
Heiko Stuebner <heiko@...ech.de>, Guo Ren <guoren@...nel.org>,
Björn Töpel <bjorn@...osinc.com>,
Costa Shulyupin <costa.shul@...hat.com>,
Andrew Morton <akpm@...ux-foundation.org>, Baoquan He <bhe@...hat.com>,
Sami Tolvanen <samitolvanen@...gle.com>, Zong Li <zong.li@...ive.com>,
Ben Dooks <ben.dooks@...ethink.co.uk>, Erick Archer <erick.archer@....com>,
Vincent Chen <vincent.chen@...ive.com>,
Joel Granados <j.granados@...sung.com>, linux-kernel@...r.kernel.org
Subject: Re: [PATCH 2/3] RISC-V: Detect unaligned vector accesses supported.
On 6/6/24 19:13, Charlie Jenkins wrote:
> On Thu, Jun 06, 2024 at 02:29:23PM -0700, Charlie Jenkins wrote:
>> On Thu, Jun 06, 2024 at 02:32:14PM -0400, Jesse Taube wrote:
>>> Run a unaligned vector access to test if the system supports
>>> vector unaligned access. Add the result to a new key in hwprobe.
>>> This is useful for usermode to know if vector misaligned accesses are
>>> supported and if they are faster or slower than equivalent byte accesses.
>>>
>>> Signed-off-by: Jesse Taube <jesse@...osinc.com>
>>> ---
>>> arch/riscv/include/asm/cpufeature.h | 2 +
>>> arch/riscv/include/asm/hwprobe.h | 2 +-
>>> arch/riscv/include/asm/vector.h | 1 +
>>> arch/riscv/include/uapi/asm/hwprobe.h | 6 ++
>>> arch/riscv/kernel/sys_hwprobe.c | 34 +++++++++
>>> arch/riscv/kernel/traps_misaligned.c | 84 ++++++++++++++++++++--
>>> arch/riscv/kernel/unaligned_access_speed.c | 4 ++
>>> arch/riscv/kernel/vector.c | 2 +-
>>> 8 files changed, 129 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
>>> index 347805446151..a012c8490a27 100644
>>> --- a/arch/riscv/include/asm/cpufeature.h
>>> +++ b/arch/riscv/include/asm/cpufeature.h
>>> @@ -35,9 +35,11 @@ void riscv_user_isa_enable(void);
>>>
>>> #if defined(CONFIG_RISCV_MISALIGNED)
>>> bool check_unaligned_access_emulated_all_cpus(void);
>>> +bool check_vector_unaligned_access_all_cpus(void);
>>> void unaligned_emulation_finish(void);
>>> bool unaligned_ctl_available(void);
>>> DECLARE_PER_CPU(long, misaligned_access_speed);
>>> +DECLARE_PER_CPU(long, vector_misaligned_access);
>>> #else
>>> static inline bool unaligned_ctl_available(void)
>>> {
>>> diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
>>> index 630507dff5ea..150a9877b0af 100644
>>> --- a/arch/riscv/include/asm/hwprobe.h
>>> +++ b/arch/riscv/include/asm/hwprobe.h
>>> @@ -8,7 +8,7 @@
>>>
>>> #include <uapi/asm/hwprobe.h>
>>>
>>> -#define RISCV_HWPROBE_MAX_KEY 6
>>> +#define RISCV_HWPROBE_MAX_KEY 7
>>>
>>> static inline bool riscv_hwprobe_key_is_valid(__s64 key)
>>> {
>>> diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h
>>> index 731dcd0ed4de..776af9b37e23 100644
>>> --- a/arch/riscv/include/asm/vector.h
>>> +++ b/arch/riscv/include/asm/vector.h
>>> @@ -21,6 +21,7 @@
>>>
>>> extern unsigned long riscv_v_vsize;
>>> int riscv_v_setup_vsize(void);
>>> +bool insn_is_vector(u32 insn_buf);
>>> bool riscv_v_first_use_handler(struct pt_regs *regs);
>>> void kernel_vector_begin(void);
>>> void kernel_vector_end(void);
>>> diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h
>>> index 060212331a03..ebacff86f134 100644
>>> --- a/arch/riscv/include/uapi/asm/hwprobe.h
>>> +++ b/arch/riscv/include/uapi/asm/hwprobe.h
>>> @@ -68,6 +68,12 @@ struct riscv_hwprobe {
>>> #define RISCV_HWPROBE_MISALIGNED_UNSUPPORTED (4 << 0)
>>> #define RISCV_HWPROBE_MISALIGNED_MASK (7 << 0)
>>> #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE 6
>>> +#define RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF 7
>>> +#define RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN 0
>>> +#define RISCV_HWPROBE_VEC_MISALIGNED_SUPPORTED 1
>>> +#define RISCV_HWPROBE_VEC_MISALIGNED_SLOW 2
>>> +#define RISCV_HWPROBE_VEC_MISALIGNED_FAST 3
>>> +#define RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED 4
>>> /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
>>>
>>> /* Flags */
>>> diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
>>> index b286b73e763e..ce641cc6e47a 100644
>>> --- a/arch/riscv/kernel/sys_hwprobe.c
>>> +++ b/arch/riscv/kernel/sys_hwprobe.c
>>> @@ -184,6 +184,36 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus)
>>> }
>>> #endif
>>>
>>> +#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
>>> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
>>> +{
>>> + int cpu;
>>> + u64 perf = -1ULL;
>>> +
>>> + for_each_cpu(cpu, cpus) {
>>> + int this_perf = per_cpu(vector_misaligned_access, cpu);
>>> +
>>> + if (perf == -1ULL)
>>> + perf = this_perf;
>>> +
>>> + if (perf != this_perf) {
>>> + perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>>> + break;
>>> + }
>>> + }
>>> +
>>> + if (perf == -1ULL)
>>> + return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>>> +
>>> + return perf;
>>> +}
>>> +#else
>>> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
>>> +{
>
> I meant to mention this in my last message!
>
> The scalar version has cutouts for configs here like:
>
> if (IS_ENABLED(CONFIG_RISCV_EFFICIENT_UNALIGNED_ACCESS))
> return RISCV_HWPROBE_MISALIGNED_FAST;
>
Will add
> Having this functionality on vector as well would be much appreciated.
> I don't think it's valid to assume that vector and scalar have the same
> speed, so this would require a vector version of the RISCV_MISALIGNED
> tree in arch/riscv/Kconfig.
>
> - Charlie
>
>>> + return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>>> +}
>>> +#endif
>>> +
>>> static void hwprobe_one_pair(struct riscv_hwprobe *pair,
>>> const struct cpumask *cpus)
>>> {
>>> @@ -211,6 +241,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair,
>>> pair->value = hwprobe_misaligned(cpus);
>>> break;
>>>
>>> + case RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF:
>>> + pair->value = hwprobe_vec_misaligned(cpus);
>>> + break;
>>> +
>>> case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE:
>>> pair->value = 0;
>>> if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ))
>>> diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
>>> index 2adb7c3e4dd5..8f26c3d92230 100644
>>> --- a/arch/riscv/kernel/traps_misaligned.c
>>> +++ b/arch/riscv/kernel/traps_misaligned.c
>>> @@ -16,6 +16,7 @@
>>> #include <asm/entry-common.h>
>>> #include <asm/hwprobe.h>
>>> #include <asm/cpufeature.h>
>>> +#include <asm/vector.h>
>>>
>>> #define INSN_MATCH_LB 0x3
>>> #define INSN_MASK_LB 0x707f
>>> @@ -413,10 +414,6 @@ int handle_misaligned_load(struct pt_regs *regs)
>>>
>>> perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, addr);
>>>
>>> -#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>>> - *this_cpu_ptr(&misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_EMULATED;
>>> -#endif
>>> -
>>> if (!unaligned_enabled)
>>> return -1;
>>>
>>> @@ -426,6 +423,17 @@ int handle_misaligned_load(struct pt_regs *regs)
>>> if (get_insn(regs, epc, &insn))
>>> return -1;
>>>
>>> +#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>>> + if (insn_is_vector(insn) &&
>>> + *this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_SUPPORTED) {
>>> + *this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
>>> + regs->epc = epc + INSN_LEN(insn);
>>> + return 0;
There is a return before scalar speed is set.
>>> + }
>>> +
>>> + *this_cpu_ptr(&misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_EMULATED;
>>
>> This unconditionally sets scalar unaligned accesses even if the
>> unaligned access is caused by vector. Scalar unaligned accesses should
>> only be set to emulated if this function is entered from a scalar
>> unaligned load.
>>
>> The rest of this function handles how scalar unaligned accesses are
>> emulated, and the equivalent needs to happen for vector. You need to add
>> routines that manually load the data from the memory address into the
>> vector register. When Clément did this for scalar, he provided a test
>> case to help reviewers [1]. Please add onto these test cases or make
>> your own for vector.
I wansnt planing on adding emulation in this patch. I can if needed.
>>
>> Link: https://github.com/clementleger/unaligned_test [1]
>>
>>> +#endif
>>> +
>>> regs->epc = 0;
>>>
>>> if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) {
>>> @@ -625,6 +633,74 @@ static bool check_unaligned_access_emulated(int cpu)
>>> return misaligned_emu_detected;
>>> }
>>>
>>> +#ifdef CONFIG_RISCV_ISA_V
>>> +static void check_vector_unaligned_access(struct work_struct *unused)
>>
>> Can you standardize this name with the scalar version by writing
>> emulated in it?
We dont emulate it so that wouldn't make sence.
>>
>> "check_vector_unaligned_access_emulated_all_cpus"
>>
>>> +{
>>> + int cpu = smp_processor_id();
>>> + long *mas_ptr = this_cpu_ptr(&vector_misaligned_access);
>>> + unsigned long tmp_var;
>>> +
>>> + if (!riscv_isa_extension_available(hart_isa[cpu].isa, v))
>>> + return;
>>> +
>>> + *mas_ptr = RISCV_HWPROBE_VEC_MISALIGNED_SUPPORTED;
>>> +
>>> + local_irq_enable();
>>> + kernel_vector_begin();
>>> + __asm__ __volatile__ (
>>> + ".balign 4\n\t"
>>> + ".option push\n\t"
>>> + ".option arch, +v\n\t"
>>> + " vsetivli zero, 1, e16, m1, ta, ma\n\t" // Vectors of 16b
>>> + " vle16.v v0, (%[ptr])\n\t" // Load bytes
>>> + ".option pop\n\t"
>>> + : : [ptr] "r" ((u8 *)&tmp_var + 1) : "v0", "memory");
>>
>> memory is being read from, but not written to, so there is no need to
>> have a memory clobber.
fixed.
>>
>>> + kernel_vector_end();
>>> +
>>> + if (*mas_ptr == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN)
>>> + *mas_ptr = RISCV_HWPROBE_VEC_MISALIGNED_SUPPORTED;
>>> +}
>>> +
>>> +bool check_vector_unaligned_access_all_cpus(void)
>>> +{
>>> + int cpu;
>>> + bool ret = true;
>>> +
>>> + for_each_online_cpu(cpu)
>>> + if (riscv_isa_extension_available(hart_isa[cpu].isa, ZICCLSM))
>>
>> zicclsm is not specific to vector so it can be extracted out of this
>> vector specific function. Assuming that hardware properly reports the
>> extension, if zicclsm is present then it is known that both vector and
>> scalar unaligned accesses are supported.
Added so we don't need to waste cycles testing support.
>>> + per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_SUPPORTED;
>>
>> Please use the exising UNKNOWN terminology instead of renaming to
>> SUPPORTED. Any option that is not UNSUPPORTED implies that unaligned
>> accesses are supported.
Conor didnt like using UNKNOWN a proxy for "SUPPORTED"
Having SUPPORTED is better then assuing the speed to be slow.
>>
>>> + else
>>> + ret = false;
>>> +
>>> +
>>> + if (ret)
>>> + return true;
>>> + ret = true;
>>> +
>>> + schedule_on_each_cpu(check_vector_unaligned_access);
>>> +
>>> + for_each_online_cpu(cpu)
>>> + if (per_cpu(vector_misaligned_access, cpu)
>>> + != RISCV_HWPROBE_VEC_MISALIGNED_SUPPORTED)
>>> + return false;
>>> +
>>> + return ret;
>>> +}
>>> +#else
>>
>> If CONFIG_RISCV_ISA_V is not set, there is no value in checking if
>> vector unaligned accesses are supported because userspace will not be
>> allowed to use vector instructions anyway.
Oh I'm silly meant to be seting to all UNSUPPORTED.
Thanks,
Jesse Taube
>>
>> - Charlie
>>
>>> +bool check_vector_unaligned_access_all_cpus(void)
>>> +{
>>> + int cpu;
>>> +
>>> + for_each_online_cpu(cpu)
>>> + if (riscv_isa_extension_available(hart_isa[cpu].isa, ZICCLSM))
>>> + per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_SUPPORTED;
>>> + else
>>> + per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
>>> +
>>> + return false;
>>> +}
>>> +#endif
>>> +
>>> bool check_unaligned_access_emulated_all_cpus(void)
>>> {
>>> int cpu;
>>> diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
>>> index a9a6bcb02acf..92a84239beaa 100644
>>> --- a/arch/riscv/kernel/unaligned_access_speed.c
>>> +++ b/arch/riscv/kernel/unaligned_access_speed.c
>>> @@ -20,6 +20,7 @@
>>> #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
>>>
>>> DEFINE_PER_CPU(long, misaligned_access_speed);
>>> +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>>>
>>> #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>>> static cpumask_t fast_misaligned_access;
>>> @@ -264,6 +265,8 @@ static int check_unaligned_access_all_cpus(void)
>>> {
>>> bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
>>>
>>> + check_vector_unaligned_access_all_cpus();
>>> +
>>> if (!all_cpus_emulated)
>>> return check_unaligned_access_speed_all_cpus();
>>>
>>> @@ -273,6 +276,7 @@ static int check_unaligned_access_all_cpus(void)
>>> static int check_unaligned_access_all_cpus(void)
>>> {
>>> check_unaligned_access_emulated_all_cpus();
>>> + check_vector_unaligned_access_all_cpus();
>>>
>>> return 0;
>>> }
>>> diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c
>>> index 6727d1d3b8f2..2cceab739b2c 100644
>>> --- a/arch/riscv/kernel/vector.c
>>> +++ b/arch/riscv/kernel/vector.c
>>> @@ -66,7 +66,7 @@ void __init riscv_v_setup_ctx_cache(void)
>>> #endif
>>> }
>>>
>>> -static bool insn_is_vector(u32 insn_buf)
>>> +bool insn_is_vector(u32 insn_buf)
>>> {
>>> u32 opcode = insn_buf & __INSN_OPCODE_MASK;
>>> u32 width, csr;
>>> --
>>> 2.43.0
>>>
Powered by blists - more mailing lists