lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <d7072748-8bf4-46c9-86da-ee698e494288@os.amperecomputing.com>
Date: Tue, 5 Aug 2025 11:59:17 -0700
From: Yang Shi <yang@...amperecomputing.com>
To: Ryan Roberts <ryan.roberts@....com>, will@...nel.org,
 catalin.marinas@....com, akpm@...ux-foundation.org, Miko.Lenczewski@....com,
 dev.jain@....com, scott@...amperecomputing.com, cl@...two.org
Cc: linux-arm-kernel@...ts.infradead.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH 4/4] arm64: mm: split linear mapping if BBML2 is not
 supported on secondary CPUs



On 8/1/25 9:14 AM, Ryan Roberts wrote:
> On 24/07/2025 23:11, Yang Shi wrote:
>> The kernel linear mapping is painted in very early stage of system boot.
>> The cpufeature has not been finalized yet at this point.  So the linear
>> mapping is determined by the capability of boot CPU.  If the boot CPU
>> supports BBML2, large block mapping will be used for linear mapping.
>>
>> But the secondary CPUs may not support BBML2, so repaint the linear mapping
>> if large block mapping is used and the secondary CPUs don't support BBML2
>> once cpufeature is finalized on all CPUs.
>>
>> If the boot CPU doesn't support BBML2 or the secondary CPUs have the
>> same BBML2 capability with the boot CPU, repainting the linear mapping
>> is not needed.
>>
>> Signed-off-by: Yang Shi <yang@...amperecomputing.com>
>> ---
>>   arch/arm64/include/asm/mmu.h   |   6 +-
>>   arch/arm64/kernel/cpufeature.c |   8 ++
>>   arch/arm64/mm/mmu.c            | 173 +++++++++++++++++++++++++++------
>>   arch/arm64/mm/pageattr.c       |   2 +-
>>   arch/arm64/mm/proc.S           |  57 ++++++++---
>>   5 files changed, 196 insertions(+), 50 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
>> index 57f4b25e6f33..9bf50e8897e2 100644
>> --- a/arch/arm64/include/asm/mmu.h
>> +++ b/arch/arm64/include/asm/mmu.h
>> @@ -56,6 +56,8 @@ typedef struct {
>>    */
>>   #define ASID(mm)	(atomic64_read(&(mm)->context.id) & 0xffff)
>>   
>> +extern bool linear_map_requires_bbml2;
>> +
>>   static inline bool arm64_kernel_unmapped_at_el0(void)
>>   {
>>   	return alternative_has_cap_unlikely(ARM64_UNMAP_KERNEL_AT_EL0);
>> @@ -71,7 +73,9 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
>>   			       pgprot_t prot, bool page_mappings_only);
>>   extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot);
>>   extern void mark_linear_text_alias_ro(void);
>> -extern int split_kernel_pgtable_mapping(unsigned long start, unsigned long end);
>> +extern int split_kernel_pgtable_mapping(unsigned long start, unsigned long end,
>> +					unsigned int flags);
>> +extern int linear_map_split_to_ptes(void *__unused);
>>   
>>   /*
>>    * This check is triggered during the early boot before the cpufeature
>> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
>> index 1c96016a7a41..23c01d679c40 100644
>> --- a/arch/arm64/kernel/cpufeature.c
>> +++ b/arch/arm64/kernel/cpufeature.c
>> @@ -85,6 +85,7 @@
>>   #include <asm/insn.h>
>>   #include <asm/kvm_host.h>
>>   #include <asm/mmu_context.h>
>> +#include <asm/mmu.h>
>>   #include <asm/mte.h>
>>   #include <asm/hypervisor.h>
>>   #include <asm/processor.h>
>> @@ -2009,6 +2010,12 @@ static int __init __kpti_install_ng_mappings(void *__unused)
>>   	return 0;
>>   }
>>   
>> +static void __init linear_map_maybe_split_to_ptes(void)
>> +{
>> +	if (linear_map_requires_bbml2 && !system_supports_bbml2_noabort())
>> +		stop_machine(linear_map_split_to_ptes, NULL, cpu_online_mask);
>> +}
>> +
>>   static void __init kpti_install_ng_mappings(void)
>>   {
>>   	/* Check whether KPTI is going to be used */
>> @@ -3855,6 +3862,7 @@ void __init setup_system_features(void)
>>   {
>>   	setup_system_capabilities();
>>   
>> +	linear_map_maybe_split_to_ptes();
>>   	kpti_install_ng_mappings();
>>   
>>   	sve_setup();
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index f63b39613571..22f2d0869fdd 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -482,11 +482,11 @@ void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
>>   
>>   #define INVALID_PHYS_ADDR	-1
>>   
> [...]
>
> I'll review the actual walker separately (I've run out of time today).
>
>
>>   
>> +extern u32 repaint_done;
>> +
>> +int __init linear_map_split_to_ptes(void *__unused)
>> +{
>> +	typedef void (repaint_wait_fn)(void);
>> +	extern repaint_wait_fn bbml2_wait_for_repainting;
>> +	repaint_wait_fn *wait_fn;
>> +
>> +	int cpu = smp_processor_id();
>> +
>> +	wait_fn = (void *)__pa_symbol(bbml2_wait_for_repainting);
>> +
>> +	/*
>> +	 * Repainting just can be run on CPU 0 because we just can be sure
>> +	 * CPU 0 supports BBML2.
>> +	 */
>> +	if (!cpu) {
>> +		phys_addr_t kernel_start = __pa_symbol(_stext);
>> +		phys_addr_t kernel_end = __pa_symbol(__init_begin);
>> +		phys_addr_t start, end;
>> +		unsigned long vstart, vend;
>> +		int flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
>> +		u64 i;
>> +		int ret;
>> +
>> +		/*
>> +		 * Wait for all secondary CPUs get prepared for repainting
>> +		 * the linear mapping.
>> +		 */
>> +		smp_cond_load_acquire(&repaint_done, VAL == num_online_cpus());
> Is this correct? I would have thought the primary is waiting for the
> secondaries, so "VAL == num_online_cpus() - 1" ?

It is correct because repaint_done is initialized to 1.

>
>> +
>> +		memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
>> +		/* Split the whole linear mapping */
>> +		for_each_mem_range(i, &start, &end) {
> I think I asked this in the last round; but I just want to double check;
> memblock is definitely still valid here and we are definitely going to get
> exactly the same regions out as we did in map_mem()? I wonder if it's possible
> between then and now that some other component has reserved some memory? In that
> case we wouldn't walk that region?

I think it is kept unchanged. The ptdump shows no block or cont mappings 
for linear mapping if I force repaint the page table.

>
> Perhaps it would be safer (and simpler) to just walk all of [PAGE_OFFSET,
> _stext) and [__init_begin, PAGE_END) and ignore the holes?

This may walk some holes, particularly for multiple sockets machines? It 
doesn't have correctness issue, but just not very efficient.

>
>> +			if (start >= end)
>> +				return -EINVAL;
>> +
>> +			vstart = __phys_to_virt(start);
>> +			vend = __phys_to_virt(end);
>> +			ret = split_kernel_pgtable_mapping(vstart, vend, flags);
>> +			if (ret)
>> +				panic("Failed to split linear mappings\n");
>> +
>> +			flush_tlb_kernel_range(vstart, vend);
>> +		}
>> +		memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
>> +
>> +		/*
>> +		 * Relies on dsb in flush_tlb_kernel_range() to avoid
>> +		 * reordering before any page table split operations.
>> +		 */
>> +		WRITE_ONCE(repaint_done, 0);
>> +	} else {
>> +		/*
>> +		 * The secondary CPUs can't run in the same address space
>> +		 * with CPU 0 because accessing the linear mapping address
>> +		 * when CPU 0 is repainting it is not safe.
>> +		 *
>> +		 * Let the secondary CPUs run busy loop in idmap address
>> +		 * space when repainting is ongoing.
>> +		 */
>> +		cpu_install_idmap();
>> +		wait_fn();
>> +		cpu_uninstall_idmap();
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>   #ifdef CONFIG_KFENCE
>>   
>>   bool __ro_after_init kfence_early_init = !!CONFIG_KFENCE_SAMPLE_INTERVAL;
>> @@ -1079,7 +1174,8 @@ void __pi_map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot,
>>   		    int level, pte_t *tbl, bool may_use_cont, u64 va_offset);
>>   
>>   static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init,
>> -	  kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init;
>> +	  kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init,
>> +	  bbml2_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init;
>>   
>>   static void __init create_idmap(void)
>>   {
>> @@ -1104,6 +1200,19 @@ static void __init create_idmap(void)
>>   			       IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
>>   			       __phys_to_virt(ptep) - ptep);
>>   	}
>> +
>> +	/*
>> +	 * Setup idmap mapping for repaint_done flag.  It will be used if
>> +	 * repainting the linear mapping is needed later.
>> +	 */
>> +	if (linear_map_requires_bbml2) {
>> +		u64 pa = __pa_symbol(&repaint_done);
>> +		ptep = __pa_symbol(bbml2_ptes);
>> +
>> +		__pi_map_range(&ptep, pa, pa + sizeof(u32), pa, PAGE_KERNEL,
>> +			       IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
>> +			       __phys_to_virt(ptep) - ptep);
>> +	}
>>   }
>>   
>>   void __init paging_init(void)
>> diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
>> index 6566aa9d8abb..4471d7e510a1 100644
>> --- a/arch/arm64/mm/pageattr.c
>> +++ b/arch/arm64/mm/pageattr.c
>> @@ -140,7 +140,7 @@ static int update_range_prot(unsigned long start, unsigned long size,
>>   	data.set_mask = set_mask;
>>   	data.clear_mask = clear_mask;
>>   
>> -	ret = split_kernel_pgtable_mapping(start, start + size);
>> +	ret = split_kernel_pgtable_mapping(start, start + size, 0);
>>   	if (WARN_ON_ONCE(ret))
>>   		return ret;
>>   
>> diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
>> index 80d470aa469d..f0f9c49a4466 100644
>> --- a/arch/arm64/mm/proc.S
>> +++ b/arch/arm64/mm/proc.S
>> @@ -239,6 +239,25 @@ SYM_FUNC_ALIAS(__pi_idmap_cpu_replace_ttbr1, idmap_cpu_replace_ttbr1)
>>   	dsb	nshst
>>   	.endm
>>   
>> +	.macro wait_for_boot_cpu, tmp1, tmp2, tmp3, tmp4
>> +	/* Increment the flag to let the boot CPU know we're ready */
>> +1:	ldxr	\tmp3, [\tmp2]
>> +	add	\tmp3, \tmp3, #1
>> +	stxr	\tmp4, \tmp3, [\tmp2]
>> +	cbnz	\tmp4, 1b
>> +
>> +	/* Wait for the boot CPU to finish its job */
>> +	sevl
>> +1:	wfe
>> +	ldxr	\tmp3, [\tmp2]
>> +	cbnz	\tmp3, 1b
>> +
>> +	/* All done, act like nothing happened */
>> +	msr	ttbr1_el1, \tmp1
>> +	isb
>> +	ret
>> +	.endm
> You've defined the macro within "#ifdef CONFIG_UNMAP_KERNEL_AT_EL0" but then
> need to use it outside of that scope.
>
> But I don't think this needs to be a macro; I think it would be better as a
> function (as I suggested in the last round). Then the text only needs to appear
> once in the image and it can be used from both places (see below).
>
>> +
>>   /*
>>    * void __kpti_install_ng_mappings(int cpu, int num_secondaries, phys_addr_t temp_pgd,
>>    *				   unsigned long temp_pte_va)
>> @@ -416,29 +435,35 @@ alternative_else_nop_endif
>>   __idmap_kpti_secondary:
>>   	/* Uninstall swapper before surgery begins */
>>   	__idmap_cpu_set_reserved_ttbr1 x16, x17
>> +	wait_for_boot_cpu swapper_ttb, flag_ptr, w16, w17
>>   
>> -	/* Increment the flag to let the boot CPU we're ready */
>> -1:	ldxr	w16, [flag_ptr]
>> -	add	w16, w16, #1
>> -	stxr	w17, w16, [flag_ptr]
>> -	cbnz	w17, 1b
>> +	.unreq	swapper_ttb
>> +	.unreq	flag_ptr
>> +SYM_FUNC_END(idmap_kpti_install_ng_mappings)
>> +	.popsection
>> +#endif
>>   
>> -	/* Wait for the boot CPU to finish messing around with swapper */
>> -	sevl
>> -1:	wfe
>> -	ldxr	w16, [flag_ptr]
>> -	cbnz	w16, 1b
>> +/*
>> + * Wait for repainting is done. Run on secondary CPUs
>> + * only.
>> + */
>> +	.pushsection	".data", "aw", %progbits
>> +SYM_DATA(repaint_done, .long 1)
>> +	.popsection
>>   
>> -	/* All done, act like nothing happened */
>> -	msr	ttbr1_el1, swapper_ttb
>> -	isb
>> -	ret
>> +	.pushsection ".idmap.text", "a"
>> +SYM_TYPED_FUNC_START(bbml2_wait_for_repainting)
>> +	swapper_ttb	.req	x0
>> +	flag_ptr	.req	x1
>> +	mrs     swapper_ttb, ttbr1_el1
>> +	adr_l   flag_ptr, repaint_done
>> +	__idmap_cpu_set_reserved_ttbr1 x16, x17
>> +	wait_for_boot_cpu swapper_ttb, flag_ptr, w16, w17
>>   
>>   	.unreq	swapper_ttb
>>   	.unreq	flag_ptr
>> -SYM_FUNC_END(idmap_kpti_install_ng_mappings)
>> +SYM_FUNC_END(bbml2_wait_for_repainting)
>>   	.popsection
>> -#endif
> How about this instead?

Looks good to me.

Thanks,
Yang


>
> ---8<---
> diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
> index 8c75965afc9e..a116b2b8ad59 100644
> --- a/arch/arm64/mm/proc.S
> +++ b/arch/arm64/mm/proc.S
> @@ -416,8 +416,30 @@ alternative_else_nop_endif
>   __idmap_kpti_secondary:
>   	/* Uninstall swapper before surgery begins */
>   	__idmap_cpu_set_reserved_ttbr1 x16, x17
> +	b scondary_cpu_wait
> +
> +	.unreq	swapper_ttb
> +	.unreq	flag_ptr
> +SYM_FUNC_END(idmap_kpti_install_ng_mappings)
> +	.popsection
> +#endif
> +
> +	.pushsection	".data", "aw", %progbits
> +SYM_DATA(repaint_done, .long 1)
> +	.popsection
> +
> +	.pushsection ".idmap.text", "a"
> +SYM_TYPED_FUNC_START(bbml2_wait_for_repainting)
> +	/* Must be same registers as in idmap_kpti_install_ng_mappings */
> +	swapper_ttb	.req	x3
> +	flag_ptr	.req	x4
> +
> +	mrs     swapper_ttb, ttbr1_el1
> +	adr_l   flag_ptr, repaint_done
> +	__idmap_cpu_set_reserved_ttbr1 x16, x17
>
>   	/* Increment the flag to let the boot CPU we're ready */
> +scondary_cpu_wait:
>   1:	ldxr	w16, [flag_ptr]
>   	add	w16, w16, #1
>   	stxr	w17, w16, [flag_ptr]
> @@ -436,9 +458,8 @@ __idmap_kpti_secondary:
>
>   	.unreq	swapper_ttb
>   	.unreq	flag_ptr
> -SYM_FUNC_END(idmap_kpti_install_ng_mappings)
> +SYM_FUNC_END(bbml2_wait_for_repainting)
>   	.popsection
> -#endif
>
>   /*
>    *	__cpu_setup
> ---8<---
>
> Thanks,
> Ryan
>


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ