Message-ID: <d6d7e038-aa94-491a-8ad6-f48541b98b6a@os.amperecomputing.com>
Date: Thu, 6 Nov 2025 12:46:45 -0800
From: Yang Shi <yang@...amperecomputing.com>
To: Ryan Roberts <ryan.roberts@....com>, catalin.marinas@....com,
 will@...nel.org, david@...hat.com, ardb@...nel.org, dev.jain@....com,
 scott@...amperecomputing.com, cl@...two.org
Cc: linux-arm-kernel@...ts.infradead.org, linux-kernel@...r.kernel.org,
 Guenter Roeck <groeck@...gle.com>
Subject: Re: [PATCH v2 1/3] arm64: mm: Don't sleep in
 split_kernel_leaf_mapping() when in atomic context



On 11/6/25 8:09 AM, Ryan Roberts wrote:
> It has been reported that split_kernel_leaf_mapping() is trying to sleep
> in non-sleepable context. It does this when acquiring the
> pgtable_split_lock mutex, when either CONFIG_DEBUG_PAGEALLOC or
> CONFIG_KFENCE is enabled; these change linear map permissions within
> softirq context during memory allocation and/or freeing. All other paths
> into this function are called from sleepable context and so are safe.
>
> But it turns out that the memory for which these 2 features may attempt
> to modify the permissions is always mapped by pte, so there is no need
> to attempt to split the mapping. So let's exit early in these cases and
> avoid attempting to take the mutex.
>
> There is one wrinkle to this approach; late-initialized kfence allocates
> its pool from the buddy, which may be block mapped. So we must hook that
> allocation and convert it to pte-mappings up front. Previously this was
> done as a side-effect of kfence protecting all the individual pages in
> its pool at init-time, but this no longer works due to the added early
> exit path in split_kernel_leaf_mapping().
>
> So instead, do this via the existing arch_kfence_init_pool() arch hook,
> and reuse the existing linear_map_split_to_ptes() infrastructure.
>
> Closes: https://lore.kernel.org/all/f24b9032-0ec9-47b1-8b95-c0eeac7a31c5@roeck-us.net/
> Fixes: a166563e7ec3 ("arm64: mm: support large block mapping when rodata=full")
> Tested-by: Guenter Roeck <groeck@...gle.com>
> Signed-off-by: Ryan Roberts <ryan.roberts@....com>
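
For anyone else who hits the same splat, my rough understanding of the
offending path is (a sketch only; the exact frames are in the report
linked via Closes: above):

  softirq context (page alloc/free, CONFIG_DEBUG_PAGEALLOC or KFENCE)
    __kernel_map_pages() / kfence_protect_page()
      set_memory_valid()
        ... -> split_kernel_leaf_mapping()
          mutex_lock(&pgtable_split_lock)   <-- may sleep

With the early return added below, those callers bail out before ever
reaching the mutex, and the memory they touch is guaranteed to be
pte-mapped, so skipping the split is safe.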

Reviewed-by: Yang Shi <yang@...amperecomputing.com>

Just a nit below:

> ---
>   arch/arm64/include/asm/kfence.h |  3 +-
>   arch/arm64/mm/mmu.c             | 92 +++++++++++++++++++++++----------
>   2 files changed, 67 insertions(+), 28 deletions(-)
>
> diff --git a/arch/arm64/include/asm/kfence.h b/arch/arm64/include/asm/kfence.h
> index a81937fae9f6..21dbc9dda747 100644
> --- a/arch/arm64/include/asm/kfence.h
> +++ b/arch/arm64/include/asm/kfence.h
> @@ -10,8 +10,6 @@
>   
>   #include <asm/set_memory.h>
>   
> -static inline bool arch_kfence_init_pool(void) { return true; }
> -
>   static inline bool kfence_protect_page(unsigned long addr, bool protect)
>   {
>   	set_memory_valid(addr, 1, !protect);
> @@ -25,6 +23,7 @@ static inline bool arm64_kfence_can_set_direct_map(void)
>   {
>   	return !kfence_early_init;
>   }
> +bool arch_kfence_init_pool(void);
>   #else /* CONFIG_KFENCE */
>   static inline bool arm64_kfence_can_set_direct_map(void) { return false; }
>   #endif /* CONFIG_KFENCE */
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index b8d37eb037fc..a364ac2c9c61 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -708,6 +708,16 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
>   	return ret;
>   }
>   
> +static inline bool force_pte_mapping(void)
> +{
> +	bool bbml2 = system_capabilities_finalized() ?
> +		system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort();
> +
> +	return (!bbml2 && (rodata_full || arm64_kfence_can_set_direct_map() ||
> +			   is_realm_world())) ||
> +		debug_pagealloc_enabled();
> +}
> +
>   static DEFINE_MUTEX(pgtable_split_lock);
>   
>   int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
> @@ -723,6 +733,16 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
>   	if (!system_supports_bbml2_noabort())
>   		return 0;
>   
> +	/*
> +	 * If the region is within a pte-mapped area, there is no need to try to
> +	 * split. Additionally, CONFIG_DEBUG_PAGEALLOC and CONFIG_KFENCE may
> +	 * change permissions from atomic context so for those cases (which are
> +	 * always pte-mapped), we must not go any further because taking the
> +	 * mutex below may sleep.

Patch 3 changes this comment, but since patch 3 just does some cleanup
and code deduplication with no functional change, why not use the final
wording of the comment from patch 3 here directly?

Thanks,
Yang

> +	 */
> +	if (force_pte_mapping() || is_kfence_address((void *)start))
> +		return 0;
> +
>   	/*
>   	 * Ensure start and end are at least page-aligned since this is the
>   	 * finest granularity we can split to.
> @@ -758,30 +778,30 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
>   	return ret;
>   }
>   
> -static int __init split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
> -					  unsigned long next,
> -					  struct mm_walk *walk)
> +static int split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
> +				   unsigned long next, struct mm_walk *walk)
>   {
> +	gfp_t gfp = *(gfp_t *)walk->private;
>   	pud_t pud = pudp_get(pudp);
>   	int ret = 0;
>   
>   	if (pud_leaf(pud))
> -		ret = split_pud(pudp, pud, GFP_ATOMIC, false);
> +		ret = split_pud(pudp, pud, gfp, false);
>   
>   	return ret;
>   }
>   
> -static int __init split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
> -					  unsigned long next,
> -					  struct mm_walk *walk)
> +static int split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
> +				   unsigned long next, struct mm_walk *walk)
>   {
> +	gfp_t gfp = *(gfp_t *)walk->private;
>   	pmd_t pmd = pmdp_get(pmdp);
>   	int ret = 0;
>   
>   	if (pmd_leaf(pmd)) {
>   		if (pmd_cont(pmd))
>   			split_contpmd(pmdp);
> -		ret = split_pmd(pmdp, pmd, GFP_ATOMIC, false);
> +		ret = split_pmd(pmdp, pmd, gfp, false);
>   
>   		/*
>   		 * We have split the pmd directly to ptes so there is no need to
> @@ -793,9 +813,8 @@ static int __init split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
>   	return ret;
>   }
>   
> -static int __init split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
> -					  unsigned long next,
> -					  struct mm_walk *walk)
> +static int split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
> +				   unsigned long next, struct mm_walk *walk)
>   {
>   	pte_t pte = __ptep_get(ptep);
>   
> @@ -805,12 +824,18 @@ static int __init split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
>   	return 0;
>   }
>   
> -static const struct mm_walk_ops split_to_ptes_ops __initconst = {
> +static const struct mm_walk_ops split_to_ptes_ops = {
>   	.pud_entry	= split_to_ptes_pud_entry,
>   	.pmd_entry	= split_to_ptes_pmd_entry,
>   	.pte_entry	= split_to_ptes_pte_entry,
>   };
>   
> +static int range_split_to_ptes(unsigned long start, unsigned long end, gfp_t gfp)
> +{
> +	return walk_kernel_page_table_range_lockless(start, end,
> +					&split_to_ptes_ops, NULL, &gfp);
> +}
> +
>   static bool linear_map_requires_bbml2 __initdata;
>   
>   u32 idmap_kpti_bbml2_flag;
> @@ -847,11 +872,9 @@ static int __init linear_map_split_to_ptes(void *__unused)
>   		 * PTE. The kernel alias remains static throughout runtime so
>   		 * can continue to be safely mapped with large mappings.
>   		 */
> -		ret = walk_kernel_page_table_range_lockless(lstart, kstart,
> -						&split_to_ptes_ops, NULL, NULL);
> +		ret = range_split_to_ptes(lstart, kstart, GFP_ATOMIC);
>   		if (!ret)
> -			ret = walk_kernel_page_table_range_lockless(kend, lend,
> -						&split_to_ptes_ops, NULL, NULL);
> +			ret = range_split_to_ptes(kend, lend, GFP_ATOMIC);
>   		if (ret)
>   			panic("Failed to split linear map\n");
>   		flush_tlb_kernel_range(lstart, lend);
> @@ -1002,6 +1025,33 @@ static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp)
>   	memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
>   	__kfence_pool = phys_to_virt(kfence_pool);
>   }
> +
> +bool arch_kfence_init_pool(void)
> +{
> +	unsigned long start = (unsigned long)__kfence_pool;
> +	unsigned long end = start + KFENCE_POOL_SIZE;
> +	int ret;
> +
> +	/* Exit early if we know the linear map is already pte-mapped. */
> +	if (!system_supports_bbml2_noabort() || force_pte_mapping())
> +		return true;
> +
> +	/* Kfence pool is already pte-mapped for the early init case. */
> +	if (kfence_early_init)
> +		return true;
> +
> +	mutex_lock(&pgtable_split_lock);
> +	ret = range_split_to_ptes(start, end, GFP_PGTABLE_KERNEL);
> +	mutex_unlock(&pgtable_split_lock);
> +
> +	/*
> +	 * Since the system supports bbml2_noabort, tlb invalidation is not
> +	 * required here; the pgtable mappings have been split to pte but larger
> +	 * entries may safely linger in the TLB.
> +	 */
> +
> +	return !ret;
> +}
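
One extra data point on where this hook ends up running, from my reading
of the generic kfence code (the intermediate call names are approximate):

  kfence_init_late()
    allocate pool from the buddy              <-- may be block mapped
    kfence_init_pool()
      arch_kfence_init_pool()                 <-- splits the pool to ptes
      per-page protect/unprotect via kfence_protect_page()

So by the time kfence starts flipping individual pages, the whole pool is
already pte-mapped and the early return in split_kernel_leaf_mapping()
covers the kfence addresses too.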
>   #else /* CONFIG_KFENCE */
>   
>   static inline phys_addr_t arm64_kfence_alloc_pool(void) { return 0; }
> @@ -1009,16 +1059,6 @@ static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) {
>   
>   #endif /* CONFIG_KFENCE */
>   
> -static inline bool force_pte_mapping(void)
> -{
> -	bool bbml2 = system_capabilities_finalized() ?
> -		system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort();
> -
> -	return (!bbml2 && (rodata_full || arm64_kfence_can_set_direct_map() ||
> -			   is_realm_world())) ||
> -		debug_pagealloc_enabled();
> -}
> -
>   static void __init map_mem(pgd_t *pgdp)
>   {
>   	static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);

