Message-ID: <e304762f-6d66-44f4-8fe4-7ca866f93d69@os.amperecomputing.com>
Date: Mon, 23 Jun 2025 13:56:56 -0700
From: Yang Shi <yang@...amperecomputing.com>
To: Ryan Roberts <ryan.roberts@....com>, will@...nel.org,
 catalin.marinas@....com, Miko.Lenczewski@....com, dev.jain@....com,
 scott@...amperecomputing.com, cl@...two.org
Cc: linux-arm-kernel@...ts.infradead.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH 4/4] arm64: mm: split linear mapping if BBML2 is not
 supported on secondary CPUs



On 6/23/25 5:26 AM, Ryan Roberts wrote:
> On 31/05/2025 03:41, Yang Shi wrote:
>> The kernel linear mapping is painted at a very early stage of system
>> boot, before cpufeatures have been finalized, so the linear mapping is
>> determined by the capability of the boot CPU alone.  If the boot CPU
>> supports BBML2, large block mappings will be used for the linear
>> mapping.
>>
>> But the secondary CPUs may not support BBML2, so once cpufeatures are
>> finalized on all CPUs, repaint the linear mapping if large block
>> mappings are in use and the secondary CPUs don't support BBML2.
>>
>> If the boot CPU doesn't support BBML2, or the secondary CPUs have the
>> same BBML2 capability as the boot CPU, repainting the linear mapping
>> is not needed.
>>
>> Signed-off-by: Yang Shi <yang@...amperecomputing.com>
>> ---
>>   arch/arm64/include/asm/mmu.h   |   3 +
>>   arch/arm64/kernel/cpufeature.c |  16 +++++
>>   arch/arm64/mm/mmu.c            | 108 ++++++++++++++++++++++++++++++++-
>>   arch/arm64/mm/proc.S           |  41 +++++++++++++
>>   4 files changed, 166 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
>> index 2693d63bf837..ad38135d1aa1 100644
>> --- a/arch/arm64/include/asm/mmu.h
>> +++ b/arch/arm64/include/asm/mmu.h
>> @@ -56,6 +56,8 @@ typedef struct {
>>    */
>>   #define ASID(mm)	(atomic64_read(&(mm)->context.id) & 0xffff)
>>   
>> +extern bool block_mapping;
> Is this really useful to cache? Why not just call force_pte_mapping() instead?
> It's the inverse. It's also not a great name for a global variable.

We can use force_pte_mapping().

>
> But perhaps it is better to cache a boolean that also reflects the bbml2 status:
>
> bool linear_map_requires_bbml2;
>
> Then create_idmap() will only bother to add to the idmap if there is a chance
> you will need to repaint. And repaint_linear_mappings() won't need to explicitly
> check !rodata_full.

OK, IIUC linear_map_requires_bbml2 = !force_pte_mapping() && rodata_full

>
> I think this can be __initdata too?

Yes, it is only used during the bootup stage.
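
A rough sketch of what that might look like (assuming the flag is set
in map_mem(), where both inputs are known):

bool __initdata linear_map_requires_bbml2;

static void __init map_mem(pgd_t *pgdp)
{
	int flags = NO_EXEC_MAPPINGS;

	/* Cache whether repainting may be needed later */
	linear_map_requires_bbml2 = !force_pte_mapping() && rodata_full;

	if (force_pte_mapping())
		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

	/* ... rest of map_mem() unchanged ... */
}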

>
>> +
>>   static inline bool arm64_kernel_unmapped_at_el0(void)
>>   {
>>   	return alternative_has_cap_unlikely(ARM64_UNMAP_KERNEL_AT_EL0);
>> @@ -72,6 +74,7 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
>>   extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot);
>>   extern void mark_linear_text_alias_ro(void);
>>   extern int split_linear_mapping(unsigned long start, unsigned long end);
>> +extern int __repaint_linear_mappings(void *__unused);
> nit: "repaint_linear_mappings" is a bit vague. How about
> linear_map_split_to_ptes() or similar?

Sounds good to me.

>
>>   
>>   /*
>>    * This check is triggered during the early boot before the cpufeature
>> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
>> index 5fc2a4a804de..5151c101fbaf 100644
>> --- a/arch/arm64/kernel/cpufeature.c
>> +++ b/arch/arm64/kernel/cpufeature.c
>> @@ -85,6 +85,7 @@
>>   #include <asm/insn.h>
>>   #include <asm/kvm_host.h>
>>   #include <asm/mmu_context.h>
>> +#include <asm/mmu.h>
>>   #include <asm/mte.h>
>>   #include <asm/hypervisor.h>
>>   #include <asm/processor.h>
>> @@ -2005,6 +2006,20 @@ static int __init __kpti_install_ng_mappings(void *__unused)
>>   	return 0;
>>   }
>>   
>> +static void __init repaint_linear_mappings(void)
>> +{
>> +	if (!block_mapping)
>> +		return;
>> +
>> +	if (!rodata_full)
>> +		return;
>> +
>> +	if (system_supports_bbml2_noabort())
>> +		return;
>> +
>> +	stop_machine(__repaint_linear_mappings, NULL, cpu_online_mask);
> With the above suggestions, I think this can be simplified to something like:
>
> static void __init linear_map_maybe_split_to_ptes(void)
> {
> 	if (linear_map_requires_bbml2 && !system_supports_bbml2_noabort())
> 		stop_machine(linear_map_split_to_ptes, NULL, cpu_online_mask);
> }

Yeah, ack.

>
>> +}
>> +
>>   static void __init kpti_install_ng_mappings(void)
>>   {
>>   	/* Check whether KPTI is going to be used */
>> @@ -3868,6 +3883,7 @@ void __init setup_system_features(void)
>>   {
>>   	setup_system_capabilities();
>>   
>> +	repaint_linear_mappings();
>>   	kpti_install_ng_mappings();
>>   
>>   	sve_setup();
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index 4c5d3aa35d62..3922af89abbb 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -209,6 +209,8 @@ static void split_pmd(pmd_t pmd, phys_addr_t pte_phys, int flags)
>>   	/* It must be naturally aligned if PMD is leaf */
>>   	if ((flags & NO_CONT_MAPPINGS) == 0)
>>   		prot = __pgprot(pgprot_val(prot) | PTE_CONT);
>> +	else
>> +		prot = __pgprot(pgprot_val(prot) & ~PTE_CONT);
>>   
>>   	for (int i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++)
>>   		__set_pte_nosync(ptep, pfn_pte(pfn, prot));
>> @@ -230,6 +232,8 @@ static void split_pud(pud_t pud, phys_addr_t pmd_phys, int flags)
>>   	/* It must be naturally aligned if PUD is leaf */
>>   	if ((flags & NO_CONT_MAPPINGS) == 0)
>>   		prot = __pgprot(pgprot_val(prot) | PTE_CONT);
>> +	else
>> +		prot = __pgprot(pgprot_val(prot) & ~PTE_CONT);
>>   
>>   	for (int i = 0; i < PTRS_PER_PMD; i++, pmdp++) {
>>   		__set_pmd_nosync(pmdp, pfn_pmd(pfn, prot));
>> @@ -833,6 +837,86 @@ void __init mark_linear_text_alias_ro(void)
>>   			    PAGE_KERNEL_RO);
>>   }
>>   
>> +static phys_addr_t repaint_pgtable_alloc(int shift)
>> +{
>> +	void *ptr;
>> +
>> +	ptr = (void *)__get_free_page(GFP_ATOMIC);
>> +	if (!ptr)
>> +		return -1;
>> +
>> +	return __pa(ptr);
>> +}
>> +
>> +extern u32 repaint_done;
>> +
>> +int __init __repaint_linear_mappings(void *__unused)
>> +{
>> +	typedef void (repaint_wait_fn)(void);
>> +	extern repaint_wait_fn bbml2_wait_for_repainting;
>> +	repaint_wait_fn *wait_fn;
>> +
>> +	phys_addr_t kernel_start = __pa_symbol(_stext);
>> +	phys_addr_t kernel_end = __pa_symbol(__init_begin);
>> +	phys_addr_t start, end;
>> +	unsigned long vstart, vend;
>> +	u64 i;
>> +	int ret;
>> +	int flags = NO_EXEC_MAPPINGS | NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS |
>> +		    SPLIT_MAPPINGS;
>> +	int cpu = smp_processor_id();
> nit: most of these variables are only needed by cpu 0 so you could defer their
> initialization until inside the if condition below.

OK
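
Something like this, perhaps (a rough sketch; only cpu and wait_fn are
needed on every CPU, so the rest can move inside the branch):

	int cpu = smp_processor_id();

	if (!cpu) {
		/* Only the boot CPU does the split, so declare these here */
		phys_addr_t kernel_start = __pa_symbol(_stext);
		phys_addr_t kernel_end = __pa_symbol(__init_begin);
		int flags = NO_EXEC_MAPPINGS | NO_BLOCK_MAPPINGS |
			    NO_CONT_MAPPINGS | SPLIT_MAPPINGS;

		/* ... wait for secondaries, split, release them ... */
	}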

>
>> +
>> +	wait_fn = (void *)__pa_symbol(bbml2_wait_for_repainting);
>> +
>> +	/*
>> +	 * Repainting can only be run on CPU 0, because it is the only
>> +	 * CPU we can be sure supports BBML2.
>> +	 */
>> +	if (!cpu) {
>> +		/*
>> +		 * Wait for all secondary CPUs to get prepared for repainting
>> +		 * the linear mapping.
>> +		 */
>> +wait_for_secondary:
>> +		if (READ_ONCE(repaint_done) != num_online_cpus())
>> +			goto wait_for_secondary;
> This feels suspect when comparing against the assembly code that does a similar
> sync operation in idmap_kpti_install_ng_mappings:
>
> 	/* We're the boot CPU. Wait for the others to catch up */
> 	sevl
> 1:	wfe
> 	ldaxr	w17, [flag_ptr]
> 	eor	w17, w17, num_cpus
> 	cbnz	w17, 1b
>
> The acquire semantics of the ldaxr are needed here to ensure that memory
> accesses later in program order don't get reordered before it. READ_ONCE() is
> relaxed, so it permits such reordering.
>
> The wfe means the CPU is not just furiously spinning, but actually waiting for
> a secondary CPU to exclusively write to the variable at flag_ptr.
>
> I think you can drop the whole loop and just call:
>
> 	smp_cond_load_acquire(&repaint_done, VAL == num_online_cpus());

Yeah, it seems better.
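
i.e. the open-coded wait above would collapse to something like
(sketch):

	if (!cpu) {
		/*
		 * Wait for all secondary CPUs to park in the idmap.
		 * smp_cond_load_acquire() provides the required acquire
		 * ordering and uses a WFE-based wait on arm64.
		 */
		smp_cond_load_acquire(&repaint_done,
				      VAL == num_online_cpus());

		/* ... split the linear mapping ... */
	}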

>
>> +
>> +		memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
>> +		/* Split the whole linear mapping */
>> +		for_each_mem_range(i, &start, &end) {
>> +			if (start >= end)
>> +				return -EINVAL;
>> +
>> +			vstart = __phys_to_virt(start);
>> +			vend = __phys_to_virt(end);
>> +			ret = __create_pgd_mapping_locked(init_mm.pgd, start,
>> +					vstart, (end - start), __pgprot(0),
>> +					repaint_pgtable_alloc, flags);
>> +			if (ret)
>> +				panic("Failed to split linear mappings\n");
>> +
>> +			flush_tlb_kernel_range(vstart, vend);
>> +		}
>> +		memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
> You're relying on the memblock API here. Is that valid, given that we are quite
> late into boot at this point and have already transferred control to the buddy
> allocator?

Yes, it is still valid at this point. Memblock isn't discarded until
page_alloc_init_late(), which is called after smp_init(). And memblock
is only discarded at all when CONFIG_ARCH_KEEP_MEMBLOCK is not set;
arm64 (like some other arches, e.g. x86, riscv, etc.) selects this
config, so memblock remains valid throughout.

>
> I was thinking you would just need to traverse the linear map region of the
> kernel page table, splitting each large leaf you find into the next size down?

The benefit of using memblock is that we can easily skip the memory for
the kernel itself. The kernel is always mapped with block mappings, and
we don't want to split it. When traversing the page table, it is hard
to tell whether a given mapping belongs to the kernel or not.

>
>> +
>> +		WRITE_ONCE(repaint_done, 0);
> I think this depends on the dsb(ish) in flush_tlb_kernel_range() to ensure it is
> not re-ordered before any pgtable split operations? Might be worth a comment.

Sure.
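
Something along these lines, perhaps (the comment wording is just a
suggestion):

		/*
		 * Release the secondary CPUs spinning in the idmap.
		 * The dsb(ish) in the preceding flush_tlb_kernel_range()
		 * ensures this store cannot be observed before the page
		 * table split operations have completed.
		 */
		WRITE_ONCE(repaint_done, 0);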

>
>
>> +	} else {
>> +		/*
>> +		 * The secondary CPUs can't run in the same address space
>> +		 * as CPU 0, because accessing linear mapping addresses
>> +		 * while CPU 0 is repainting them is not safe.
>> +		 *
>> +		 * Let the secondary CPUs busy-loop in the idmap address
>> +		 * space while repainting is ongoing.
>> +		 */
>> +		cpu_install_idmap();
>> +		wait_fn();
>> +		cpu_uninstall_idmap();
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>   #ifdef CONFIG_KFENCE
>>   
>>   bool __ro_after_init kfence_early_init = !!CONFIG_KFENCE_SAMPLE_INTERVAL;
>> @@ -887,6 +971,8 @@ static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) {
>>   
>>   #endif /* CONFIG_KFENCE */
>>   
>> +bool block_mapping;
>> +
>>   static inline bool force_pte_mapping(void)
>>   {
>>   	/*
>> @@ -915,6 +1001,8 @@ static void __init map_mem(pgd_t *pgdp)
>>   	int flags = NO_EXEC_MAPPINGS;
>>   	u64 i;
>>   
>> +	block_mapping = true;
>> +
>>   	/*
>>   	 * Setting hierarchical PXNTable attributes on table entries covering
>>   	 * the linear region is only possible if it is guaranteed that no table
>> @@ -930,8 +1018,10 @@ static void __init map_mem(pgd_t *pgdp)
>>   
>>   	early_kfence_pool = arm64_kfence_alloc_pool();
>>   
>> -	if (force_pte_mapping())
>> +	if (force_pte_mapping()) {
>> +		block_mapping = false;
>>   		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
>> +	}
>>   
>>   	/*
>>   	 * Take care not to create a writable alias for the
>> @@ -1063,7 +1153,8 @@ void __pi_map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot,
>>   		    int level, pte_t *tbl, bool may_use_cont, u64 va_offset);
>>   
>>   static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init,
>> -	  kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init;
>> +	  kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init,
>> +	  bbml2_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init;
>>   
>>   static void __init create_idmap(void)
>>   {
>> @@ -1088,6 +1179,19 @@ static void __init create_idmap(void)
>>   			       IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
>>   			       __phys_to_virt(ptep) - ptep);
>>   	}
>> +
>> +	/*
>> +	 * Set up an idmap mapping for the repaint_done flag.  It will be
>> +	 * used if repainting the linear mapping is needed later.
>> +	 */
>> +	if (block_mapping) {
>> +		u64 pa = __pa_symbol(&repaint_done);
>> +		ptep = __pa_symbol(bbml2_ptes);
>> +
>> +		__pi_map_range(&ptep, pa, pa + sizeof(u32), pa, PAGE_KERNEL,
>> +			       IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
>> +			       __phys_to_virt(ptep) - ptep);
>> +	}
>>   }
>>   
>>   void __init paging_init(void)
>> diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
>> index fb30c8804f87..c40e6126c093 100644
>> --- a/arch/arm64/mm/proc.S
>> +++ b/arch/arm64/mm/proc.S
>> @@ -440,6 +440,47 @@ SYM_FUNC_END(idmap_kpti_install_ng_mappings)
>>   	.popsection
>>   #endif
>>   
>> +/*
>> + * Wait until repainting is done. Runs on secondary
>> + * CPUs only.
>> + */
>> +	.pushsection	".data", "aw", %progbits
>> +SYM_DATA(repaint_done, .long 1)
>> +	.popsection
>> +
>> +	.pushsection ".idmap.text", "a"
>> +SYM_TYPED_FUNC_START(bbml2_wait_for_repainting)
>> +	swapper_ttb	.req	x0
>> +	flag_ptr	.req	x1
>> +
>> +	mrs	swapper_ttb, ttbr1_el1
>> +	adr_l	flag_ptr, repaint_done
>> +
>> +	/* Uninstall swapper before surgery begins */
>> +	__idmap_cpu_set_reserved_ttbr1 x16, x17
>> +
>> +	/* Increment the flag to let the boot CPU know we're ready */
>> +1:	ldxr	w16, [flag_ptr]
>> +	add	w16, w16, #1
>> +	stxr	w17, w16, [flag_ptr]
>> +	cbnz	w17, 1b
>> +
>> +	/* Wait for the boot CPU to finish repainting */
>> +	sevl
>> +1:	wfe
>> +	ldxr	w16, [flag_ptr]
>> +	cbnz	w16, 1b
>> +
>> +	/* All done, act like nothing happened */
>> +	msr	ttbr1_el1, swapper_ttb
>> +	isb
>> +	ret
>> +
>> +	.unreq	swapper_ttb
>> +	.unreq	flag_ptr
>> +SYM_FUNC_END(bbml2_wait_for_repainting)
>> +	.popsection
> This is identical to __idmap_kpti_secondary. Can't we just refactor it into a
> common function? I think you can even reuse the same refcount variable (i.e. no
> need for both repaint_done and __idmap_kpti_flag).

I think we can extract the busy-loop logic below into a macro:

        .macro wait_for_boot_cpu, swapper_ttb, flag_ptr
        /* Uninstall swapper before surgery begins */
        __idmap_cpu_set_reserved_ttbr1 x16, x17

        /* Increment the flag to let the boot CPU know we're ready */
1:      ldxr    w16, [\flag_ptr]
        add     w16, w16, #1
        stxr    w17, w16, [\flag_ptr]
        cbnz    w17, 1b

        /* Wait for the boot CPU to finish messing around with swapper */
        sevl
1:      wfe
        ldxr    w16, [\flag_ptr]
        cbnz    w16, 1b

        /* All done, act like nothing happened */
        msr     ttbr1_el1, \swapper_ttb
        isb
        ret
        .endm

Then kpti and repainting can just use this macro, so that we don't have
to force both of them to use the same registers for the swapper_ttb and
flag_ptr parameters.
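
For example, the repainting side might then reduce to something like
this (a rough sketch on my part; the register choices are illustrative
and it assumes the macro keeps the trailing ret):

	.pushsection ".idmap.text", "a"
SYM_TYPED_FUNC_START(bbml2_wait_for_repainting)
	/* Save swapper's TTBR1 and locate the flag, then park */
	mrs	x0, ttbr1_el1
	adr_l	x1, repaint_done
	wait_for_boot_cpu x0, x1
SYM_FUNC_END(bbml2_wait_for_repainting)
	.popsection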

Thanks,
Yang


>
> Thanks,
> Ryan
>
>
>> +
>>   /*
>>    *	__cpu_setup
>>    *

