Message-Id: <20221004140230.748788-1-wangrui@loongson.cn>
Date:   Tue,  4 Oct 2022 22:02:30 +0800
From:   Rui Wang <wangrui@...ngson.cn>
To:     loongarch@...ts.linux.dev
Cc:     Huacai Chen <chenhuacai@...nel.org>,
        WANG Xuerui <kernel@...0n.name>, linux-kernel@...r.kernel.org,
        Rui Wang <wangrui@...ngson.cn>
Subject: [PATCH] LoongArch: mm: Refactor TLB handlers

This patch simplifies the TLB load, store and modify exception handlers:

1. Reduce the number of instructions, such as ALU/CSR operations and
   memory accesses (see the sketch after this list).
2. Execute tlbsrch only in the fast path.
3. Return directly from the fast path for huge pages.
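
To make the first point concrete: at each page-table level the handlers
compute the byte address of the next entry from the bad virtual address.
In C terms the computation is roughly the sketch below (illustrative
only, with placeholder constants; the handlers do this in assembly, and
the new code folds it into one bstrpick.d bit-field extract plus one
alsl.d shift-and-add instead of the old srli.d/andi/slli.d/add.d
sequence with a csrrd re-read of BADV at every level):

  #include <stdint.h>

  /* Placeholder values for illustration only; the real ones depend on
   * the configured page size and number of page table levels.
   */
  #define DEMO_PGDIR_SHIFT	33
  #define DEMO_PTRS_PER_PGD	(1UL << 11)

  static inline uint64_t pgd_entry_addr(uint64_t pgd_base, uint64_t badv)
  {
  	/* old: srli.d + andi + slli.d + add.d (4 instructions)
  	 * new: bstrpick.d (bit-field extract) + alsl.d (shift-add)
  	 */
  	uint64_t idx = (badv >> DEMO_PGDIR_SHIFT) & (DEMO_PTRS_PER_PGD - 1);

  	return pgd_base + (idx << 3);	/* 8-byte entries */
  }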

It also fixes a concurrent modification issue in the huge page fast
path, which can occur through the following sequence of steps:

   CPU-1 (In TLB exception)         CPU-2 (In THP)
1: Load PMD entry (HUGE=1)
2: Goto huge path
3:                                  Store PMD entry (HUGE=0)
4: Reload PMD entry (HUGE=0)
5: Fill TLB entry (PA is incorrect)
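
In C terms the hazard is a check-then-reload of the same PMD entry, and
the fix is to keep the single loaded value in a register and reuse it.
A rough sketch of the two patterns (illustrative only; pmd_t,
_PAGE_HUGE_DEMO and fill_huge_tlb() are hypothetical stand-ins, not the
kernel definitions):

  #include <stdint.h>

  typedef uint64_t pmd_t;
  #define _PAGE_HUGE_DEMO	(1UL << 6)	/* placeholder bit */

  void fill_huge_tlb(pmd_t pmd);		/* hypothetical stand-in */

  /* Broken pattern: the PMD is re-read after the HUGE check, so a THP
   * split on another CPU can clear the bit between the two reads and
   * the TLB is filled from a non-huge (already split) entry.
   */
  static void huge_path_broken(volatile pmd_t *pmdp)
  {
  	pmd_t pmd = *pmdp;		/* step 1: HUGE=1            */
  	if (pmd & _PAGE_HUGE_DEMO) {
  		pmd = *pmdp;		/* step 4: HUGE may now be 0 */
  		fill_huge_tlb(pmd);	/* step 5: PA is incorrect   */
  	}
  }

  /* Fixed pattern: branch on and fill from the value already loaded,
   * so the huge path never observes a second, different entry.
   */
  static void huge_path_fixed(volatile pmd_t *pmdp)
  {
  	pmd_t pmd = *pmdp;
  	if (pmd & _PAGE_HUGE_DEMO)
  		fill_huge_tlb(pmd);
  }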

This also slightly improves TLB processing performance, as measured
with the test program below:

* Normal pages: 2.15%
* Huge pages:   1.70%

  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>
  #include <sys/mman.h>

  int main(int argc, char *argv[])
  {
        size_t page_size;
        size_t mem_size;
        size_t off;
        void *base;
        int flags;
        int i;

        if (argc < 2) {
                fprintf(stderr, "%s MEM_SIZE [HUGE]\n", argv[0]);
                return -1;
        }

        page_size = sysconf(_SC_PAGESIZE);
        flags = MAP_PRIVATE | MAP_ANONYMOUS;
        mem_size = strtoul(argv[1], NULL, 10);
        if (argc > 2)
                flags |= MAP_HUGETLB;

        /* Repeat the map/touch/unmap cycle to amplify the TLB refill
         * cost being measured. */
        for (i = 0; i < 10; i++) {
                base = mmap(NULL, mem_size, PROT_READ, flags, -1, 0);
                if (base == MAP_FAILED) {
                        fprintf(stderr, "Map memory failed!\n");
                        return -1;
                }

                /* Touch every page to trigger a TLB refill exception. */
                for (off = 0; off < mem_size; off += page_size)
                        *(volatile int *)(base + off);

                munmap(base, mem_size);
        }

        return 0;
  }

Signed-off-by: Rui Wang <wangrui@...ngson.cn>
---
 arch/loongarch/mm/tlbex.S | 223 ++++++++++++++++----------------------
 1 file changed, 93 insertions(+), 130 deletions(-)

diff --git a/arch/loongarch/mm/tlbex.S b/arch/loongarch/mm/tlbex.S
index 39743337999e..c97bcaad2ff4 100644
--- a/arch/loongarch/mm/tlbex.S
+++ b/arch/loongarch/mm/tlbex.S
@@ -10,6 +10,11 @@
 #include <asm/regdef.h>
 #include <asm/stackframe.h>
 
+#define PTRS_PER_PGD_BITS	(PAGE_SHIFT - 3)
+#define PTRS_PER_PUD_BITS	(PAGE_SHIFT - 3)
+#define PTRS_PER_PMD_BITS	(PAGE_SHIFT - 3)
+#define PTRS_PER_PTE_BITS	(PAGE_SHIFT - 3)
+
 	.macro tlb_do_page_fault, write
 	SYM_FUNC_START(tlb_do_page_fault_\write)
 	SAVE_ALL
@@ -52,25 +57,17 @@ SYM_FUNC_START(handle_tlb_load)
 
 vmalloc_done_load:
 	/* Get PGD offset in bytes */
-	srli.d	t0, t0, PGDIR_SHIFT
-	andi	t0, t0, (PTRS_PER_PGD - 1)
-	slli.d	t0, t0, 3
-	add.d	t1, t1, t0
+	bstrpick.d	ra, t0, PTRS_PER_PGD_BITS + PGDIR_SHIFT - 1, PGDIR_SHIFT
+	alsl.d		t1, ra, t1, 3
 #if CONFIG_PGTABLE_LEVELS > 3
-	csrrd	t0, LOONGARCH_CSR_BADV
 	ld.d	t1, t1, 0
-	srli.d	t0, t0, PUD_SHIFT
-	andi	t0, t0, (PTRS_PER_PUD - 1)
-	slli.d	t0, t0, 3
-	add.d	t1, t1, t0
+	bstrpick.d	ra, t0, PTRS_PER_PUD_BITS + PUD_SHIFT - 1, PUD_SHIFT
+	alsl.d		t1, ra, t1, 3
 #endif
 #if CONFIG_PGTABLE_LEVELS > 2
-	csrrd	t0, LOONGARCH_CSR_BADV
 	ld.d	t1, t1, 0
-	srli.d	t0, t0, PMD_SHIFT
-	andi	t0, t0, (PTRS_PER_PMD - 1)
-	slli.d	t0, t0, 3
-	add.d	t1, t1, t0
+	bstrpick.d	ra, t0, PTRS_PER_PMD_BITS + PMD_SHIFT - 1, PMD_SHIFT
+	alsl.d		t1, ra, t1, 3
 #endif
 	ld.d	ra, t1, 0
 
@@ -79,27 +76,20 @@ vmalloc_done_load:
 	 * instead contains the tlb pte. Check the PAGE_HUGE bit and
 	 * see if we need to jump to huge tlb processing.
 	 */
-	andi	t0, ra, _PAGE_HUGE
-	bnez	t0, tlb_huge_update_load
+	rotri.d	ra, ra, _PAGE_HUGE_SHIFT + 1
+	bltz	ra, tlb_huge_update_load
 
-	csrrd	t0, LOONGARCH_CSR_BADV
-	srli.d	t0, t0, PAGE_SHIFT
-	andi	t0, t0, (PTRS_PER_PTE - 1)
-	slli.d	t0, t0, _PTE_T_LOG2
-	add.d	t1, ra, t0
+	rotri.d	ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1)
+	bstrpick.d	t0, t0, PTRS_PER_PTE_BITS + PAGE_SHIFT - 1, PAGE_SHIFT
+	alsl.d	t1, t0, ra, _PTE_T_LOG2
 
 #ifdef CONFIG_SMP
 smp_pgtable_change_load:
-#endif
-#ifdef CONFIG_SMP
 	ll.d	t0, t1, 0
 #else
 	ld.d	t0, t1, 0
 #endif
-	tlbsrch
-
-	srli.d	ra, t0, _PAGE_PRESENT_SHIFT
-	andi	ra, ra, 1
+	andi	ra, t0, _PAGE_PRESENT
 	beqz	ra, nopage_tlb_load
 
 	ori	t0, t0, _PAGE_VALID
@@ -109,8 +99,8 @@ smp_pgtable_change_load:
 #else
 	st.d	t0, t1, 0
 #endif
-	ori	t1, t1, 8
-	xori	t1, t1, 8
+	tlbsrch
+	bstrins.d	t1, zero, 3, 3
 	ld.d	t0, t1, 0
 	ld.d	t1, t1, 8
 	csrwr	t0, LOONGARCH_CSR_TLBELO0
@@ -133,23 +123,22 @@ vmalloc_load:
 	 */
 tlb_huge_update_load:
 #ifdef CONFIG_SMP
-	ll.d	t0, t1, 0
-#else
-	ld.d	t0, t1, 0
+	ll.d	ra, t1, 0
 #endif
-	srli.d	ra, t0, _PAGE_PRESENT_SHIFT
-	andi	ra, ra, 1
-	beqz	ra, nopage_tlb_load
-	tlbsrch
+	andi	t0, ra, _PAGE_PRESENT
+	beqz	t0, nopage_tlb_load
 
-	ori	t0, t0, _PAGE_VALID
 #ifdef CONFIG_SMP
+	ori	t0, ra, _PAGE_VALID
 	sc.d	t0, t1, 0
 	beqz	t0, tlb_huge_update_load
-	ld.d	t0, t1, 0
+	ori	t0, ra, _PAGE_VALID
 #else
+	rotri.d	ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1)
+	ori	t0, ra, _PAGE_VALID
 	st.d	t0, t1, 0
 #endif
+	tlbsrch
 	addu16i.d	t1, zero, -(CSR_TLBIDX_EHINV >> 16)
 	addi.d		ra, t1, 0
 	csrxchg		ra, t1, LOONGARCH_CSR_TLBIDX
@@ -173,9 +162,8 @@ tlb_huge_update_load:
 	srli.d	t1, t1, (_PAGE_HGLOBAL_SHIFT - _PAGE_GLOBAL_SHIFT)
 	or	t0, t0, t1
 
-	addi.d	ra, t0, 0
-	csrwr	t0, LOONGARCH_CSR_TLBELO0
-	addi.d	t0, ra, 0
+	move	ra, t0
+	csrwr	ra, LOONGARCH_CSR_TLBELO0
 
 	/* Convert to entrylo1 */
 	addi.d	t1, zero, 1
@@ -193,6 +181,11 @@ tlb_huge_update_load:
 	addu16i.d	t0, zero, (CSR_TLBIDX_PS >> 16)
 	addu16i.d	t1, zero, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16))
 	csrxchg		t1, t0, LOONGARCH_CSR_TLBIDX
+leave_huge_load:
+	csrrd	t0, EXCEPTION_KS0
+	csrrd	t1, EXCEPTION_KS1
+	csrrd	ra, EXCEPTION_KS2
+	ertn
 
 nopage_tlb_load:
 	dbar	0
@@ -215,26 +208,17 @@ SYM_FUNC_START(handle_tlb_store)
 
 vmalloc_done_store:
 	/* Get PGD offset in bytes */
-	srli.d	t0, t0, PGDIR_SHIFT
-	andi	t0, t0, (PTRS_PER_PGD - 1)
-	slli.d	t0, t0, 3
-	add.d	t1, t1, t0
-
+	bstrpick.d	ra, t0, PTRS_PER_PGD_BITS + PGDIR_SHIFT - 1, PGDIR_SHIFT
+	alsl.d		t1, ra, t1, 3
 #if CONFIG_PGTABLE_LEVELS > 3
-	csrrd	t0, LOONGARCH_CSR_BADV
 	ld.d	t1, t1, 0
-	srli.d	t0, t0, PUD_SHIFT
-	andi	t0, t0, (PTRS_PER_PUD - 1)
-	slli.d	t0, t0, 3
-	add.d	t1, t1, t0
+	bstrpick.d	ra, t0, PTRS_PER_PUD_BITS + PUD_SHIFT - 1, PUD_SHIFT
+	alsl.d		t1, ra, t1, 3
 #endif
 #if CONFIG_PGTABLE_LEVELS > 2
-	csrrd	t0, LOONGARCH_CSR_BADV
 	ld.d	t1, t1, 0
-	srli.d	t0, t0, PMD_SHIFT
-	andi	t0, t0, (PTRS_PER_PMD - 1)
-	slli.d	t0, t0, 3
-	add.d	t1, t1, t0
+	bstrpick.d	ra, t0, PTRS_PER_PMD_BITS + PMD_SHIFT - 1, PMD_SHIFT
+	alsl.d		t1, ra, t1, 3
 #endif
 	ld.d	ra, t1, 0
 
@@ -243,28 +227,21 @@ vmalloc_done_store:
 	 * instead contains the tlb pte. Check the PAGE_HUGE bit and
 	 * see if we need to jump to huge tlb processing.
 	 */
-	andi	t0, ra, _PAGE_HUGE
-	bnez	t0, tlb_huge_update_store
+	rotri.d	ra, ra, _PAGE_HUGE_SHIFT + 1
+	bltz	ra, tlb_huge_update_store
 
-	csrrd	t0, LOONGARCH_CSR_BADV
-	srli.d	t0, t0, PAGE_SHIFT
-	andi	t0, t0, (PTRS_PER_PTE - 1)
-	slli.d	t0, t0, _PTE_T_LOG2
-	add.d	t1, ra, t0
+	rotri.d	ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1)
+	bstrpick.d	t0, t0, PTRS_PER_PTE_BITS + PAGE_SHIFT - 1, PAGE_SHIFT
+	alsl.d	t1, t0, ra, _PTE_T_LOG2
 
 #ifdef CONFIG_SMP
 smp_pgtable_change_store:
-#endif
-#ifdef CONFIG_SMP
 	ll.d	t0, t1, 0
 #else
 	ld.d	t0, t1, 0
 #endif
-	tlbsrch
-
-	srli.d	ra, t0, _PAGE_PRESENT_SHIFT
-	andi	ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT)
-	xori	ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT)
+	andi	ra, t0, _PAGE_PRESENT | _PAGE_WRITE
+	xori	ra, ra, _PAGE_PRESENT | _PAGE_WRITE
 	bnez	ra, nopage_tlb_store
 
 	ori	t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
@@ -274,9 +251,8 @@ smp_pgtable_change_store:
 #else
 	st.d	t0, t1, 0
 #endif
-
-	ori	t1, t1, 8
-	xori	t1, t1, 8
+	tlbsrch
+	bstrins.d	t1, zero, 3, 3
 	ld.d	t0, t1, 0
 	ld.d	t1, t1, 8
 	csrwr	t0, LOONGARCH_CSR_TLBELO0
@@ -299,25 +275,23 @@ vmalloc_store:
 	 */
 tlb_huge_update_store:
 #ifdef CONFIG_SMP
-	ll.d	t0, t1, 0
-#else
-	ld.d	t0, t1, 0
+	ll.d	ra, t1, 0
 #endif
-	srli.d	ra, t0, _PAGE_PRESENT_SHIFT
-	andi	ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT)
-	xori	ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT)
-	bnez	ra, nopage_tlb_store
-
-	tlbsrch
-	ori	t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
+	andi	t0, ra, _PAGE_PRESENT | _PAGE_WRITE
+	xori	t0, t0, _PAGE_PRESENT | _PAGE_WRITE
+	bnez	t0, nopage_tlb_store
 
 #ifdef CONFIG_SMP
+	ori	t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
 	sc.d	t0, t1, 0
 	beqz	t0, tlb_huge_update_store
-	ld.d	t0, t1, 0
+	ori	t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
 #else
+	rotri.d	ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1)
+	ori	t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
 	st.d	t0, t1, 0
 #endif
+	tlbsrch
 	addu16i.d	t1, zero, -(CSR_TLBIDX_EHINV >> 16)
 	addi.d		ra, t1, 0
 	csrxchg		ra, t1, LOONGARCH_CSR_TLBIDX
@@ -340,9 +314,8 @@ tlb_huge_update_store:
 	srli.d	t1, t1, (_PAGE_HGLOBAL_SHIFT - _PAGE_GLOBAL_SHIFT)
 	or	t0, t0, t1
 
-	addi.d	ra, t0, 0
-	csrwr	t0, LOONGARCH_CSR_TLBELO0
-	addi.d	t0, ra, 0
+	move	ra, t0
+	csrwr	ra, LOONGARCH_CSR_TLBELO0
 
 	/* Convert to entrylo1 */
 	addi.d	t1, zero, 1
@@ -361,6 +334,11 @@ tlb_huge_update_store:
 	addu16i.d	t0, zero, (CSR_TLBIDX_PS >> 16)
 	addu16i.d	t1, zero, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16))
 	csrxchg		t1, t0, LOONGARCH_CSR_TLBIDX
+leave_huge_store:
+	csrrd	t0, EXCEPTION_KS0
+	csrrd	t1, EXCEPTION_KS1
+	csrrd	ra, EXCEPTION_KS2
+	ertn
 
 nopage_tlb_store:
 	dbar	0
@@ -383,25 +361,17 @@ SYM_FUNC_START(handle_tlb_modify)
 
 vmalloc_done_modify:
 	/* Get PGD offset in bytes */
-	srli.d	t0, t0, PGDIR_SHIFT
-	andi	t0, t0, (PTRS_PER_PGD - 1)
-	slli.d	t0, t0, 3
-	add.d	t1, t1, t0
+	bstrpick.d	ra, t0, PTRS_PER_PGD_BITS + PGDIR_SHIFT - 1, PGDIR_SHIFT
+	alsl.d		t1, ra, t1, 3
 #if CONFIG_PGTABLE_LEVELS > 3
-	csrrd	t0, LOONGARCH_CSR_BADV
 	ld.d	t1, t1, 0
-	srli.d	t0, t0, PUD_SHIFT
-	andi	t0, t0, (PTRS_PER_PUD - 1)
-	slli.d	t0, t0, 3
-	add.d	t1, t1, t0
+	bstrpick.d	ra, t0, PTRS_PER_PUD_BITS + PUD_SHIFT - 1, PUD_SHIFT
+	alsl.d		t1, ra, t1, 3
 #endif
 #if CONFIG_PGTABLE_LEVELS > 2
-	csrrd	t0, LOONGARCH_CSR_BADV
 	ld.d	t1, t1, 0
-	srli.d	t0, t0, PMD_SHIFT
-	andi	t0, t0, (PTRS_PER_PMD - 1)
-	slli.d	t0, t0, 3
-	add.d	t1, t1, t0
+	bstrpick.d	ra, t0, PTRS_PER_PMD_BITS + PMD_SHIFT - 1, PMD_SHIFT
+	alsl.d		t1, ra, t1, 3
 #endif
 	ld.d	ra, t1, 0
 
@@ -410,27 +380,20 @@ vmalloc_done_modify:
 	 * instead contains the tlb pte. Check the PAGE_HUGE bit and
 	 * see if we need to jump to huge tlb processing.
 	 */
-	andi	t0, ra, _PAGE_HUGE
-	bnez	t0, tlb_huge_update_modify
+	rotri.d	ra, ra, _PAGE_HUGE_SHIFT + 1
+	bltz	ra, tlb_huge_update_modify
 
-	csrrd	t0, LOONGARCH_CSR_BADV
-	srli.d	t0, t0, PAGE_SHIFT
-	andi	t0, t0, (PTRS_PER_PTE - 1)
-	slli.d	t0, t0, _PTE_T_LOG2
-	add.d	t1, ra, t0
+	rotri.d	ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1)
+	bstrpick.d	t0, t0, PTRS_PER_PTE_BITS + PAGE_SHIFT - 1, PAGE_SHIFT
+	alsl.d	t1, t0, ra, _PTE_T_LOG2
 
 #ifdef CONFIG_SMP
 smp_pgtable_change_modify:
-#endif
-#ifdef CONFIG_SMP
 	ll.d	t0, t1, 0
 #else
 	ld.d	t0, t1, 0
 #endif
-	tlbsrch
-
-	srli.d	ra, t0, _PAGE_WRITE_SHIFT
-	andi	ra, ra, 1
+	andi	ra, t0, _PAGE_WRITE
 	beqz	ra, nopage_tlb_modify
 
 	ori	t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
@@ -440,8 +403,8 @@ smp_pgtable_change_modify:
 #else
 	st.d	t0, t1, 0
 #endif
-	ori	t1, t1, 8
-	xori	t1, t1, 8
+	tlbsrch
+	bstrins.d	t1, zero, 3, 3
 	ld.d	t0, t1, 0
 	ld.d	t1, t1, 8
 	csrwr	t0, LOONGARCH_CSR_TLBELO0
@@ -464,23 +427,19 @@ vmalloc_modify:
 	 */
 tlb_huge_update_modify:
 #ifdef CONFIG_SMP
-	ll.d	t0, t1, 0
-#else
-	ld.d	t0, t1, 0
+	ll.d	ra, t1, 0
 #endif
-
-	srli.d	ra, t0, _PAGE_WRITE_SHIFT
-	andi	ra, ra, 1
-	beqz	ra, nopage_tlb_modify
-
-	tlbsrch
-	ori	t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
+	andi	t0, ra, _PAGE_WRITE
+	beqz	t0, nopage_tlb_modify
 
 #ifdef CONFIG_SMP
+	ori	t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
 	sc.d	t0, t1, 0
 	beqz	t0, tlb_huge_update_modify
-	ld.d	t0, t1, 0
+	ori	t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
 #else
+	rotri.d	ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1)
+	ori	t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
 	st.d	t0, t1, 0
 #endif
 	/*
@@ -499,9 +458,8 @@ tlb_huge_update_modify:
 	srli.d	t1, t1, (_PAGE_HGLOBAL_SHIFT - _PAGE_GLOBAL_SHIFT)
 	or	t0, t0, t1
 
-	addi.d	ra, t0, 0
-	csrwr	t0, LOONGARCH_CSR_TLBELO0
-	addi.d	t0, ra, 0
+	move	ra, t0
+	csrwr	ra, LOONGARCH_CSR_TLBELO0
 
 	/* Convert to entrylo1 */
 	addi.d	t1, zero, 1
@@ -520,6 +478,11 @@ tlb_huge_update_modify:
 	addu16i.d	t0, zero, (CSR_TLBIDX_PS >> 16)
 	addu16i.d	t1, zero, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16))
 	csrxchg		t1, t0, LOONGARCH_CSR_TLBIDX
+leave_huge_modify:
+	csrrd	t0, EXCEPTION_KS0
+	csrrd	t1, EXCEPTION_KS1
+	csrrd	ra, EXCEPTION_KS2
+	ertn
 
 nopage_tlb_modify:
 	dbar	0
-- 
2.37.3
