lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <C1EDD7DE-A0A9-4F43-9EC1-1C5A212A7033@jrtc27.com>
Date:   Tue, 12 Sep 2023 20:00:33 +0100
From:   Jessica Clarke <jrtc27@...c27.com>
To:     Guo Ren <guoren@...nel.org>
Cc:     Jisheng Zhang <jszhang@...nel.org>, Arnd Bergmann <arnd@...db.de>,
        Paul Walmsley <paul.walmsley@...ive.com>,
        Palmer Dabbelt <palmer@...belt.com>,
        Albert Ou <aou@...s.berkeley.edu>,
        linux-riscv <linux-riscv@...ts.infradead.org>,
        linux-kernel@...r.kernel.org
Subject: Re: [PATCH] riscv: errata: thead: use riscv_nonstd_cache_ops for CMO

On 12 Sep 2023, at 11:53, Guo Ren <guoren@...nel.org> wrote:
> 
> On Tue, Sep 12, 2023 at 03:27:40PM +0800, Jisheng Zhang wrote:
>> Previously, we use alternative mechanism to dynamically patch
>> the CMO operations for THEAD C906/C910 during boot for performance
>> reason. But as pointed out by Arnd, "there is already a significant
>> cost in accessing the invalidated cache lines afterwards, which is
>> likely going to be much higher than the cost of an indirect branch".
>> And indeed, there's no performance difference with GMAC and EMMC per
>> my test on Sipeed Lichee Pi 4A board.
>> 
>> Use riscv_nonstd_cache_ops for THEAD C906/C910 CMO to simplify
>> the alternative code, and to acchieve Arnd's goal -- "I think
>> moving the THEAD ops at the same level as all nonstandard operations
>> makes sense, but I'd still leave CMO as an explicit fast path that
>> avoids the indirect branch. This seems like the right thing to do both
>> for readability and for platforms on which the indirect branch has a
>> noticeable overhead."
>> 
>> Signed-off-by: Jisheng Zhang <jszhang@...nel.org>
>> ---
>> arch/riscv/Kconfig.errata            |  1 +
>> arch/riscv/errata/thead/errata.c     | 76 +++++++++++++++++++++++++++-
>> arch/riscv/include/asm/errata_list.h | 50 +++---------------
>> 3 files changed, 81 insertions(+), 46 deletions(-)
>> 
>> diff --git a/arch/riscv/Kconfig.errata b/arch/riscv/Kconfig.errata
>> index 566bcefeab50..d7972914f9de 100644
>> --- a/arch/riscv/Kconfig.errata
>> +++ b/arch/riscv/Kconfig.errata
>> @@ -78,6 +78,7 @@ config ERRATA_THEAD_CMO
>> bool "Apply T-Head cache management errata"
>> depends on ERRATA_THEAD && MMU
>> select RISCV_DMA_NONCOHERENT
>> + select RISCV_NONSTANDARD_CACHE_OPS
>> default y
>> help
>>   This will apply the cache management errata to handle the
>> diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c
>> index 0554ed4bf087..1c320abfe446 100644
>> --- a/arch/riscv/errata/thead/errata.c
>> +++ b/arch/riscv/errata/thead/errata.c
>> @@ -12,8 +12,10 @@
>> #include <asm/alternative.h>
>> #include <asm/cacheflush.h>
>> #include <asm/cpufeature.h>
>> +#include <asm/dma-noncoherent.h>
>> #include <asm/errata_list.h>
>> #include <asm/hwprobe.h>
>> +#include <asm/io.h>
>> #include <asm/patch.h>
>> #include <asm/vendorid_list.h>
>> 
>> @@ -33,6 +35,75 @@ static bool errata_probe_pbmt(unsigned int stage,
>> return false;
>> }
>> 
>> +/*
>> + * dcache.ipa rs1 (invalidate, physical address)
>> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
>> + *   0000001    01010      rs1       000      00000  0001011
>> + * dache.iva rs1 (invalida, virtual address)
>> + *   0000001    00110      rs1       000      00000  0001011
> Remove dache.iva rs1 ...
> 
>> + *
>> + * dcache.cpa rs1 (clean, physical address)
>> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
>> + *   0000001    01001      rs1       000      00000  0001011
> Remove dcache.cpa rs1 ...
> 
>> + * dcache.cva rs1 (clean, virtual address)
>> + *   0000001    00101      rs1       000      00000  0001011
> Remove dcache.cva rs1 ...
> 
>> + *
>> + * dcache.cipa rs1 (clean then invalidate, physical address)
>> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
>> + *   0000001    01011      rs1       000      00000  0001011
>> + * dcache.civa rs1 (... virtual address)
>> + *   0000001    00111      rs1       000      00000  0001011
> Remove dcache.civa rs1 ...
> 
>> + *
>> + * sync.s (make sure all cache operations finished)
>> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
>> + *   0000000    11001     00000      000      00000  0001011
>> + */
>> +#define THEAD_inval_A0 ".long 0x0265000b"
>> +#define THEAD_clean_A0 ".long 0x0255000b"
>> +#define THEAD_flush_A0 ".long 0x0275000b"
>> +#define THEAD_SYNC_S ".long 0x0190000b"
>> +
>> +#define THEAD_CMO_OP(_op, _start, _size, _cachesize) \
>> +asm volatile("mv a0, %1\n\t" \
>> +      "j 2f\n\t" \
>> +      "3:\n\t" \
>> +      THEAD_##_op##_A0 "\n\t" \
>> +      "add a0, a0, %0\n\t" \
>> +      "2:\n\t" \
>> +      "bltu a0, %2, 3b\n\t" \
>> +      THEAD_SYNC_S \
>> +      : : "r"(_cachesize), \
>> +  "r"((unsigned long)(_start) & ~((_cachesize) - 1UL)), \
>> +  "r"((unsigned long)(_start) + (_size)) \
>> +      : "a0")
>> +
>> +static void thead_errata_cache_inv(phys_addr_t paddr, size_t size)
>> +{
>> + void *vaddr = phys_to_virt(paddr);
> No need to phys_to_virt, and we could use paddr directly (dcache.ipa
> rs1).
> 
>> +
>> + THEAD_CMO_OP(inval, vaddr, size, riscv_cbom_block_size);
>> +}
>> +
>> +static void thead_errata_cache_wback(phys_addr_t paddr, size_t size)
>> +{
>> + void *vaddr = phys_to_virt(paddr);
>> +
>> + THEAD_CMO_OP(clean, vaddr, size, riscv_cbom_block_size);
>> +}
> Please remove the thead_errata_cache_wback because T-HEAD processors
> would prioritize using an invalid cacheline instead of evicting an
> existing cacheline. When we do dcache clean, the following operations
> are to let other interconnect masters read. So, keeping wback_inv for
> T-HEAD processors is the best choice, and maybe some other processors'
> vendor has a different idea, but please use the wback_inv instead of
> wback_only for the T-HEAD processors.

Unless you can demonstrate that your cores have significantly worse
performance when using wback instead of wback_inv I do not think the
non-standard implementation should deviate from the semantics of the
standard one. There are efforts to unify the implemented semantics of
the operations across architectures and this would obstruct those.

Jess

>> +
>> +static void thead_errata_cache_wback_inv(phys_addr_t paddr, size_t size)
>> +{
>> + void *vaddr = phys_to_virt(paddr);
> Ditto.
> 
>> +
>> + THEAD_CMO_OP(flush, vaddr, size, riscv_cbom_block_size);
>> +}
>> +
>> +static const struct riscv_nonstd_cache_ops thead_errata_cmo_ops = {
>> + .wback = &thead_errata_cache_wback,
> Please  .wback = &thead_errata_cache_wback_inv,
> 
> 
>> + .inv = &thead_errata_cache_inv,
>> + .wback_inv = &thead_errata_cache_wback_inv,
>> +};
>> +
>> static bool errata_probe_cmo(unsigned int stage,
>>      unsigned long arch_id, unsigned long impid)
>> {
>> @@ -48,6 +119,8 @@ static bool errata_probe_cmo(unsigned int stage,
>> if (stage == RISCV_ALTERNATIVES_BOOT) {
>> riscv_cbom_block_size = L1_CACHE_BYTES;
>> riscv_noncoherent_supported();
>> + if (IS_ENABLED(CONFIG_RISCV_NONSTANDARD_CACHE_OPS))
>> + riscv_noncoherent_register_cache_ops(&thead_errata_cmo_ops);
>> }
>> 
>> return true;
>> @@ -77,8 +150,7 @@ static u32 thead_errata_probe(unsigned int stage,
>> if (errata_probe_pbmt(stage, archid, impid))
>> cpu_req_errata |= BIT(ERRATA_THEAD_PBMT);
>> 
>> - if (errata_probe_cmo(stage, archid, impid))
>> - cpu_req_errata |= BIT(ERRATA_THEAD_CMO);
>> + errata_probe_cmo(stage, archid, impid);
>> 
>> if (errata_probe_pmu(stage, archid, impid))
>> cpu_req_errata |= BIT(ERRATA_THEAD_PMU);
>> diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
>> index b55b434f0059..ea33288f8a25 100644
>> --- a/arch/riscv/include/asm/errata_list.h
>> +++ b/arch/riscv/include/asm/errata_list.h
>> @@ -24,9 +24,8 @@
>> 
>> #ifdef CONFIG_ERRATA_THEAD
>> #define ERRATA_THEAD_PBMT 0
>> -#define ERRATA_THEAD_CMO 1
>> -#define ERRATA_THEAD_PMU 2
>> -#define ERRATA_THEAD_NUMBER 3
>> +#define ERRATA_THEAD_PMU 1
>> +#define ERRATA_THEAD_NUMBER 2
>> #endif
>> 
>> #ifdef __ASSEMBLY__
>> @@ -94,54 +93,17 @@ asm volatile(ALTERNATIVE( \
>> #define ALT_THEAD_PMA(_val)
>> #endif
>> 
>> -/*
>> - * dcache.ipa rs1 (invalidate, physical address)
>> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
>> - *   0000001    01010      rs1       000      00000  0001011
>> - * dache.iva rs1 (invalida, virtual address)
>> - *   0000001    00110      rs1       000      00000  0001011
>> - *
>> - * dcache.cpa rs1 (clean, physical address)
>> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
>> - *   0000001    01001      rs1       000      00000  0001011
>> - * dcache.cva rs1 (clean, virtual address)
>> - *   0000001    00101      rs1       000      00000  0001011
>> - *
>> - * dcache.cipa rs1 (clean then invalidate, physical address)
>> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
>> - *   0000001    01011      rs1       000      00000  0001011
>> - * dcache.civa rs1 (... virtual address)
>> - *   0000001    00111      rs1       000      00000  0001011
>> - *
>> - * sync.s (make sure all cache operations finished)
>> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
>> - *   0000000    11001     00000      000      00000  0001011
>> - */
>> -#define THEAD_inval_A0 ".long 0x0265000b"
>> -#define THEAD_clean_A0 ".long 0x0255000b"
>> -#define THEAD_flush_A0 ".long 0x0275000b"
>> -#define THEAD_SYNC_S ".long 0x0190000b"
>> -
>> #define ALT_CMO_OP(_op, _start, _size, _cachesize) \
>> -asm volatile(ALTERNATIVE_2( \
>> - __nops(6), \
>> +asm volatile(ALTERNATIVE( \
>> + __nops(5), \
>> "mv a0, %1\n\t" \
>> "j 2f\n\t" \
>> "3:\n\t" \
>> CBO_##_op(a0) \
>> "add a0, a0, %0\n\t" \
>> "2:\n\t" \
>> - "bltu a0, %2, 3b\n\t" \
>> - "nop", 0, RISCV_ISA_EXT_ZICBOM, CONFIG_RISCV_ISA_ZICBOM, \
>> - "mv a0, %1\n\t" \
>> - "j 2f\n\t" \
>> - "3:\n\t" \
>> - THEAD_##_op##_A0 "\n\t" \
>> - "add a0, a0, %0\n\t" \
>> - "2:\n\t" \
>> - "bltu a0, %2, 3b\n\t" \
>> - THEAD_SYNC_S, THEAD_VENDOR_ID, \
>> - ERRATA_THEAD_CMO, CONFIG_ERRATA_THEAD_CMO) \
>> + "bltu a0, %2, 3b\n\t", \
>> + 0, RISCV_ISA_EXT_ZICBOM, CONFIG_RISCV_ISA_ZICBOM) \
>> : : "r"(_cachesize), \
>>     "r"((unsigned long)(_start) & ~((_cachesize) - 1UL)), \
>>     "r"((unsigned long)(_start) + (_size)) \
>> -- 
>> 2.40.1
>> 
>> 
>> _______________________________________________
>> linux-riscv mailing list
>> linux-riscv@...ts.infradead.org
>> http://lists.infradead.org/mailman/listinfo/linux-riscv
>> 
> 
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@...ts.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ