Message-ID: <3bc77e2c-66d6-40d3-a30f-07139e8e8f43@arm.com>
Date: Mon, 23 Jun 2025 13:26:24 +0100
From: Ryan Roberts <ryan.roberts@....com>
To: Yang Shi <yang@...amperecomputing.com>, will@...nel.org,
catalin.marinas@....com, Miko.Lenczewski@....com, dev.jain@....com,
scott@...amperecomputing.com, cl@...two.org
Cc: linux-arm-kernel@...ts.infradead.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH 4/4] arm64: mm: split linear mapping if BBML2 is not
supported on secondary CPUs
On 31/05/2025 03:41, Yang Shi wrote:
> The kernel linear mapping is painted in a very early stage of system boot.
> The cpufeatures have not been finalized yet at this point, so the linear
> mapping is determined by the capability of the boot CPU. If the boot CPU
> supports BBML2, large block mappings will be used for the linear mapping.
>
> But the secondary CPUs may not support BBML2, so, once cpufeatures are
> finalized on all CPUs, repaint the linear mapping if large block mappings
> were used and the secondary CPUs don't support BBML2.
>
> If the boot CPU doesn't support BBML2, or the secondary CPUs have the
> same BBML2 capability as the boot CPU, repainting the linear mapping
> is not needed.
>
> Signed-off-by: Yang Shi <yang@...amperecomputing.com>
> ---
>  arch/arm64/include/asm/mmu.h   |   3 +
>  arch/arm64/kernel/cpufeature.c |  16 +++++
>  arch/arm64/mm/mmu.c            | 108 ++++++++++++++++++++++++++++++++-
>  arch/arm64/mm/proc.S           |  41 +++++++++++++
>  4 files changed, 166 insertions(+), 2 deletions(-)
>
> diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
> index 2693d63bf837..ad38135d1aa1 100644
> --- a/arch/arm64/include/asm/mmu.h
> +++ b/arch/arm64/include/asm/mmu.h
> @@ -56,6 +56,8 @@ typedef struct {
>   */
>  #define ASID(mm)	(atomic64_read(&(mm)->context.id) & 0xffff)
>
> +extern bool block_mapping;
Is this really useful to cache? Why not just call force_pte_mapping() instead?
It's the inverse. It's also not a great name for a global variable.
But perhaps it is better to cache a boolean that also reflects the bbml2 status:
bool linear_map_requires_bbml2;
Then create_idmap() will only bother to add the flag to the idmap if there is a
chance you will need to repaint. And repaint_linear_mappings() won't need to
explicitly check !rodata_full.
I think this can be __initdata too?
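For illustration, something like this is what I had in mind (where exactly it
gets set, and folding rodata_full into it, are my assumption):

bool __initdata linear_map_requires_bbml2;

static void __init map_mem(pgd_t *pgdp)
{
	...
	/*
	 * Block mappings only ever need repainting if we will later
	 * change permissions at page granularity, i.e. rodata_full.
	 */
	linear_map_requires_bbml2 = !force_pte_mapping() && rodata_full;
	...
}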
> +
>  static inline bool arm64_kernel_unmapped_at_el0(void)
>  {
>  	return alternative_has_cap_unlikely(ARM64_UNMAP_KERNEL_AT_EL0);
> @@ -72,6 +74,7 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
> extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot);
> extern void mark_linear_text_alias_ro(void);
> extern int split_linear_mapping(unsigned long start, unsigned long end);
> +extern int __repaint_linear_mappings(void *__unused);
nit: "repaint_linear_mappings" is a bit vague. How about
linear_map_split_to_ptes() or similar?
>
> /*
> * This check is triggered during the early boot before the cpufeature
> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> index 5fc2a4a804de..5151c101fbaf 100644
> --- a/arch/arm64/kernel/cpufeature.c
> +++ b/arch/arm64/kernel/cpufeature.c
> @@ -85,6 +85,7 @@
> #include <asm/insn.h>
> #include <asm/kvm_host.h>
> #include <asm/mmu_context.h>
> +#include <asm/mmu.h>
> #include <asm/mte.h>
> #include <asm/hypervisor.h>
> #include <asm/processor.h>
> @@ -2005,6 +2006,20 @@ static int __init __kpti_install_ng_mappings(void *__unused)
>  	return 0;
>  }
>
> +static void __init repaint_linear_mappings(void)
> +{
> +	if (!block_mapping)
> +		return;
> +
> +	if (!rodata_full)
> +		return;
> +
> +	if (system_supports_bbml2_noabort())
> +		return;
> +
> +	stop_machine(__repaint_linear_mappings, NULL, cpu_online_mask);
With the above suggestions, I think this can be simplified to something like:
static void __init linear_map_maybe_split_to_ptes(void)
{
	if (linear_map_requires_bbml2 && !system_supports_bbml2_noabort())
		stop_machine(linear_map_split_to_ptes, NULL, cpu_online_mask);
}
> +}
> +
>  static void __init kpti_install_ng_mappings(void)
>  {
>  	/* Check whether KPTI is going to be used */
> @@ -3868,6 +3883,7 @@ void __init setup_system_features(void)
>  {
>  	setup_system_capabilities();
>
> +	repaint_linear_mappings();
>  	kpti_install_ng_mappings();
>
>  	sve_setup();
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 4c5d3aa35d62..3922af89abbb 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -209,6 +209,8 @@ static void split_pmd(pmd_t pmd, phys_addr_t pte_phys, int flags)
>  	/* It must be naturally aligned if PMD is leaf */
>  	if ((flags & NO_CONT_MAPPINGS) == 0)
>  		prot = __pgprot(pgprot_val(prot) | PTE_CONT);
> +	else
> +		prot = __pgprot(pgprot_val(prot) & ~PTE_CONT);
>
>  	for (int i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++)
>  		__set_pte_nosync(ptep, pfn_pte(pfn, prot));
> @@ -230,6 +232,8 @@ static void split_pud(pud_t pud, phys_addr_t pmd_phys, int flags)
>  	/* It must be naturally aligned if PUD is leaf */
>  	if ((flags & NO_CONT_MAPPINGS) == 0)
>  		prot = __pgprot(pgprot_val(prot) | PTE_CONT);
> +	else
> +		prot = __pgprot(pgprot_val(prot) & ~PTE_CONT);
>
>  	for (int i = 0; i < PTRS_PER_PMD; i++, pmdp++) {
>  		__set_pmd_nosync(pmdp, pfn_pmd(pfn, prot));
> @@ -833,6 +837,86 @@ void __init mark_linear_text_alias_ro(void)
>  			    PAGE_KERNEL_RO);
>  }
>
> +static phys_addr_t repaint_pgtable_alloc(int shift)
> +{
> +	void *ptr;
> +
> +	ptr = (void *)__get_free_page(GFP_ATOMIC);
> +	if (!ptr)
> +		return -1;
> +
> +	return __pa(ptr);
> +}
> +
> +extern u32 repaint_done;
> +
> +int __init __repaint_linear_mappings(void *__unused)
> +{
> +	typedef void (repaint_wait_fn)(void);
> +	extern repaint_wait_fn bbml2_wait_for_repainting;
> +	repaint_wait_fn *wait_fn;
> +
> +	phys_addr_t kernel_start = __pa_symbol(_stext);
> +	phys_addr_t kernel_end = __pa_symbol(__init_begin);
> +	phys_addr_t start, end;
> +	unsigned long vstart, vend;
> +	u64 i;
> +	int ret;
> +	int flags = NO_EXEC_MAPPINGS | NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS |
> +		    SPLIT_MAPPINGS;
> +	int cpu = smp_processor_id();
nit: most of these variables are only needed by cpu 0 so you could defer their
initialization until inside the if condition below.
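i.e. something like:

	if (!cpu) {
		phys_addr_t kernel_start = __pa_symbol(_stext);
		phys_addr_t kernel_end = __pa_symbol(__init_begin);
		...
	}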
> +
> +	wait_fn = (void *)__pa_symbol(bbml2_wait_for_repainting);
> +
> +	/*
> +	 * Repainting just can be run on CPU 0 because we just can be sure
> +	 * CPU 0 supports BBML2.
> +	 */
> +	if (!cpu) {
> +		/*
> +		 * Wait for all secondary CPUs get prepared for repainting
> +		 * the linear mapping.
> +		 */
> +wait_for_secondary:
> +		if (READ_ONCE(repaint_done) != num_online_cpus())
> +			goto wait_for_secondary;
This feels suspect when comparing against the assembly code that does a similar
sync operation in idmap_kpti_install_ng_mappings:
	/* We're the boot CPU. Wait for the others to catch up */
	sevl
1:	wfe
	ldaxr	w17, [flag_ptr]
	eor	w17, w17, num_cpus
	cbnz	w17, 1b
The acquire semantics of the ldaxr are needed here to ensure that memory
accesses that are later in program order don't get reordered before it.
READ_ONCE() is relaxed, so it permits that reordering.
The wfe means the CPU is not just furiously spinning, but actually waiting for
a secondary CPU to exclusively write to the variable at flag_ptr.
I think you can drop the whole loop and just call:
smp_cond_load_acquire(&repaint_done, VAL == num_online_cpus());
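IIRC the arm64 implementation of smp_cond_load_acquire() is built on
__cmpwait(), so it should compile down to essentially the same ldaxr/wfe
pattern as the assembly above; you get the acquire semantics and the
event-stream wait for free.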
> +
> +		memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
> +		/* Split the whole linear mapping */
> +		for_each_mem_range(i, &start, &end) {
> +			if (start >= end)
> +				return -EINVAL;
> +
> +			vstart = __phys_to_virt(start);
> +			vend = __phys_to_virt(end);
> +			ret = __create_pgd_mapping_locked(init_mm.pgd, start,
> +					vstart, (end - start), __pgprot(0),
> +					repaint_pgtable_alloc, flags);
> +			if (ret)
> +				panic("Failed to split linear mappings\n");
> +
> +			flush_tlb_kernel_range(vstart, vend);
> +		}
> +		memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
You're relying on the memblock API here. Is that valid, given we are quite late
into boot at this point and have transferred control to the buddy allocator?
I was thinking you would just need to traverse the linear map region of the
kernel page table, splitting each large leaf you find into the next size down?
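Roughly like this (untested; split_to_ptes_pgd() is a made-up helper that
would descend the levels and split each large leaf it finds):

static int __init linear_map_split_to_ptes(void *__unused)
{
	unsigned long addr = PAGE_OFFSET;
	unsigned long end = PAGE_END;

	/* Walk the linear map VA range directly; no memblock needed. */
	while (addr < end) {
		unsigned long next = pgd_addr_end(addr, end);

		split_to_ptes_pgd(pgd_offset_k(addr), addr, next);
		addr = next;
	}
	flush_tlb_kernel_range(PAGE_OFFSET, PAGE_END);

	return 0;
}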
> +
> +		WRITE_ONCE(repaint_done, 0);
I think this depends on the dsb(ish) in flush_tlb_kernel_range() to ensure it is
not re-ordered before any pgtable split operations? Might be worth a comment.
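Something like:

	/*
	 * Relies on the dsb(ish) issued by flush_tlb_kernel_range() above
	 * to order the pgtable split writes before this store; secondaries
	 * observing repaint_done == 0 then also observe the split tables.
	 */
	WRITE_ONCE(repaint_done, 0);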
> +	} else {
> +		/*
> +		 * The secondary CPUs can't run in the same address space
> +		 * with CPU 0 because accessing the linear mapping address
> +		 * when CPU 0 is repainting it is not safe.
> +		 *
> +		 * Let the secondary CPUs run busy loop in idmap address
> +		 * space when repainting is ongoing.
> +		 */
> +		cpu_install_idmap();
> +		wait_fn();
> +		cpu_uninstall_idmap();
> +	}
> +
> +	return 0;
> +}
> +
> #ifdef CONFIG_KFENCE
>
> bool __ro_after_init kfence_early_init = !!CONFIG_KFENCE_SAMPLE_INTERVAL;
> @@ -887,6 +971,8 @@ static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) {
>
> #endif /* CONFIG_KFENCE */
>
> +bool block_mapping;
> +
>  static inline bool force_pte_mapping(void)
>  {
>  	/*
> @@ -915,6 +1001,8 @@ static void __init map_mem(pgd_t *pgdp)
>  	int flags = NO_EXEC_MAPPINGS;
>  	u64 i;
>
> +	block_mapping = true;
> +
>  	/*
>  	 * Setting hierarchical PXNTable attributes on table entries covering
>  	 * the linear region is only possible if it is guaranteed that no table
> @@ -930,8 +1018,10 @@ static void __init map_mem(pgd_t *pgdp)
>
>  	early_kfence_pool = arm64_kfence_alloc_pool();
>
> -	if (force_pte_mapping())
> +	if (force_pte_mapping()) {
> +		block_mapping = false;
>  		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
> +	}
>
>  	/*
>  	 * Take care not to create a writable alias for the
> @@ -1063,7 +1153,8 @@ void __pi_map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot,
>  		    int level, pte_t *tbl, bool may_use_cont, u64 va_offset);
>
>  static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init,
> -	   kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init;
> +	   kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init,
> +	   bbml2_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init;
>
> static void __init create_idmap(void)
> {
> @@ -1088,6 +1179,19 @@ static void __init create_idmap(void)
>  			IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
>  			__phys_to_virt(ptep) - ptep);
>  	}
> +
> +	/*
> +	 * Setup idmap mapping for repaint_done flag. It will be used if
> +	 * repainting the linear mapping is needed later.
> +	 */
> +	if (block_mapping) {
> +		u64 pa = __pa_symbol(&repaint_done);
> +		ptep = __pa_symbol(bbml2_ptes);
> +
> +		__pi_map_range(&ptep, pa, pa + sizeof(u32), pa, PAGE_KERNEL,
> +			       IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
> +			       __phys_to_virt(ptep) - ptep);
> +	}
>  }
>
> void __init paging_init(void)
> diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
> index fb30c8804f87..c40e6126c093 100644
> --- a/arch/arm64/mm/proc.S
> +++ b/arch/arm64/mm/proc.S
> @@ -440,6 +440,47 @@ SYM_FUNC_END(idmap_kpti_install_ng_mappings)
>  	.popsection
>  #endif
>
> +/*
> + * Wait for repainting is done. Run on secondary CPUs
> + * only.
> + */
> +	.pushsection ".data", "aw", %progbits
> +SYM_DATA(repaint_done, .long 1)
> +	.popsection
> +
> +	.pushsection ".idmap.text", "a"
> +SYM_TYPED_FUNC_START(bbml2_wait_for_repainting)
> +	swapper_ttb	.req	x0
> +	flag_ptr	.req	x1
> +
> +	mrs	swapper_ttb, ttbr1_el1
> +	adr_l	flag_ptr, repaint_done
> +
> +	/* Uninstall swapper before surgery begins */
> +	__idmap_cpu_set_reserved_ttbr1 x16, x17
> +
> +	/* Increment the flag to let the boot CPU we're ready */
> +1:	ldxr	w16, [flag_ptr]
> +	add	w16, w16, #1
> +	stxr	w17, w16, [flag_ptr]
> +	cbnz	w17, 1b
> +
> +	/* Wait for the boot CPU to finish repainting */
> +	sevl
> +1:	wfe
> +	ldxr	w16, [flag_ptr]
> +	cbnz	w16, 1b
> +
> +	/* All done, act like nothing happened */
> +	msr	ttbr1_el1, swapper_ttb
> +	isb
> +	ret
> +
> +	.unreq	swapper_ttb
> +	.unreq	flag_ptr
> +SYM_FUNC_END(bbml2_wait_for_repainting)
> +	.popsection
This is identical to __idmap_kpti_secondary. Can't we just refactor it into a
common function? I think you can even reuse the same refcount variable (i.e. no
need for both repaint_done and __idmap_kpti_flag).
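e.g. an untested sketch of the shared helper (name made up; note the ldaxr in
the wait loop for the acquire semantics discussed above):

	.pushsection ".idmap.text", "a"
SYM_TYPED_FUNC_START(idmap_wait_for_cpu0)
	swapper_ttb	.req	x0
	flag_ptr	.req	x1

	mrs	swapper_ttb, ttbr1_el1
	adr_l	flag_ptr, __idmap_kpti_flag

	/* Uninstall swapper before surgery begins */
	__idmap_cpu_set_reserved_ttbr1 x16, x17

	/* Increment the flag to tell the boot CPU we're ready */
1:	ldxr	w16, [flag_ptr]
	add	w16, w16, #1
	stxr	w17, w16, [flag_ptr]
	cbnz	w17, 1b

	/* Wait for the boot CPU to zero the flag */
	sevl
2:	wfe
	ldaxr	w16, [flag_ptr]
	cbnz	w16, 2b

	/* All done, act like nothing happened */
	msr	ttbr1_el1, swapper_ttb
	isb
	ret

	.unreq	swapper_ttb
	.unreq	flag_ptr
SYM_FUNC_END(idmap_wait_for_cpu0)
	.popsection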
Thanks,
Ryan
> +
>  /*
>   * __cpu_setup
>   *