Message-ID: <d6d7e038-aa94-491a-8ad6-f48541b98b6a@os.amperecomputing.com>
Date: Thu, 6 Nov 2025 12:46:45 -0800
From: Yang Shi <yang@...amperecomputing.com>
To: Ryan Roberts <ryan.roberts@....com>, catalin.marinas@....com,
will@...nel.org, david@...hat.com, ardb@...nel.org, dev.jain@....com,
scott@...amperecomputing.com, cl@...two.org
Cc: linux-arm-kernel@...ts.infradead.org, linux-kernel@...r.kernel.org,
Guenter Roeck <groeck@...gle.com>
Subject: Re: [PATCH v2 1/3] arm64: mm: Don't sleep in
split_kernel_leaf_mapping() when in atomic context
On 11/6/25 8:09 AM, Ryan Roberts wrote:
> It has been reported that split_kernel_leaf_mapping() may attempt to
> sleep in non-sleepable context when acquiring the pgtable_split_lock
> mutex. This happens when either CONFIG_DEBUG_PAGEALLOC or CONFIG_KFENCE
> is enabled, since these features change linear map permissions from
> softirq context during memory allocation and/or freeing. All other
> paths into this function are called from sleepable context and so are
> safe.
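>
> For example, with CONFIG_KFENCE the problematic path is roughly the
> following (intermediate frames elided):
>
>   kfence_protect_page()                    <- may run in softirq context
>     set_memory_valid()
>       ...
>         split_kernel_leaf_mapping()
>           mutex_lock(&pgtable_split_lock)  <- may sleep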
>
> But it turns out that the memory whose permissions these two features
> may modify is always pte-mapped, so there is no need to attempt to
> split the mapping. Let's exit early in these cases and avoid taking
> the mutex.
>
> There is one wrinkle to this approach; late-initialized kfence
> allocates its pool from the buddy allocator, and that memory may be
> block mapped. So we must hook that allocation and convert it to pte
> mappings up front. Previously this was
> done as a side-effect of kfence protecting all the individual pages in
> its pool at init-time, but this no longer works due to the added early
> exit path in split_kernel_leaf_mapping().
>
> So instead, do this via the existing arch_kfence_init_pool() arch hook,
> and reuse the existing linear_map_split_to_ptes() infrastructure.
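>
> For context, the generic kfence code invokes this hook at the very
> start of pool initialisation; roughly, kfence_init_pool() in
> mm/kfence/core.c does:
>
>   if (!arch_kfence_init_pool())
>       return (unsigned long)__kfence_pool;  /* report init failure */
>
> so the pool is split to pte mappings before any of its pages can have
> their permissions changed.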
>
> Closes: https://lore.kernel.org/all/f24b9032-0ec9-47b1-8b95-c0eeac7a31c5@roeck-us.net/
> Fixes: a166563e7ec3 ("arm64: mm: support large block mapping when rodata=full")
> Tested-by: Guenter Roeck <groeck@...gle.com>
> Signed-off-by: Ryan Roberts <ryan.roberts@....com>
Reviewed-by: Yang Shi <yang@...amperecomputing.com>
Just a nit below:
> ---
> arch/arm64/include/asm/kfence.h | 3 +-
> arch/arm64/mm/mmu.c | 92 +++++++++++++++++++++++----------
> 2 files changed, 67 insertions(+), 28 deletions(-)
>
> diff --git a/arch/arm64/include/asm/kfence.h b/arch/arm64/include/asm/kfence.h
> index a81937fae9f6..21dbc9dda747 100644
> --- a/arch/arm64/include/asm/kfence.h
> +++ b/arch/arm64/include/asm/kfence.h
> @@ -10,8 +10,6 @@
>
> #include <asm/set_memory.h>
>
> -static inline bool arch_kfence_init_pool(void) { return true; }
> -
> static inline bool kfence_protect_page(unsigned long addr, bool protect)
> {
> set_memory_valid(addr, 1, !protect);
> @@ -25,6 +23,7 @@ static inline bool arm64_kfence_can_set_direct_map(void)
> {
> return !kfence_early_init;
> }
> +bool arch_kfence_init_pool(void);
> #else /* CONFIG_KFENCE */
> static inline bool arm64_kfence_can_set_direct_map(void) { return false; }
> #endif /* CONFIG_KFENCE */
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index b8d37eb037fc..a364ac2c9c61 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -708,6 +708,16 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
> return ret;
> }
>
> +static inline bool force_pte_mapping(void)
> +{
> + bool bbml2 = system_capabilities_finalized() ?
> + system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort();
> +
> + return (!bbml2 && (rodata_full || arm64_kfence_can_set_direct_map() ||
> + is_realm_world())) ||
> + debug_pagealloc_enabled();
> +}
> +
> static DEFINE_MUTEX(pgtable_split_lock);
>
> int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
> @@ -723,6 +733,16 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
> if (!system_supports_bbml2_noabort())
> return 0;
>
> + /*
> + * If the region is within a pte-mapped area, there is no need to try to
> + * split. Additionally, CONFIG_DEBUG_PAGEALLOC and CONFIG_KFENCE may
> + * change permissions from atomic context so for those cases (which are
> + * always pte-mapped), we must not go any further because taking the
> + * mutex below may sleep.
Patch 3 changes this comment, but since patch 3 is just cleanup and
code deduplication with no functional change, why not use the comment
from patch 3 here directly?
Thanks,
Yang
> + */
> + if (force_pte_mapping() || is_kfence_address((void *)start))
> + return 0;
> +
> /*
> * Ensure start and end are at least page-aligned since this is the
> * finest granularity we can split to.
> @@ -758,30 +778,30 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
> return ret;
> }
>
> -static int __init split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
> - unsigned long next,
> - struct mm_walk *walk)
> +static int split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
> + unsigned long next, struct mm_walk *walk)
> {
> + gfp_t gfp = *(gfp_t *)walk->private;
> pud_t pud = pudp_get(pudp);
> int ret = 0;
>
> if (pud_leaf(pud))
> - ret = split_pud(pudp, pud, GFP_ATOMIC, false);
> + ret = split_pud(pudp, pud, gfp, false);
>
> return ret;
> }
>
> -static int __init split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
> - unsigned long next,
> - struct mm_walk *walk)
> +static int split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
> + unsigned long next, struct mm_walk *walk)
> {
> + gfp_t gfp = *(gfp_t *)walk->private;
> pmd_t pmd = pmdp_get(pmdp);
> int ret = 0;
>
> if (pmd_leaf(pmd)) {
> if (pmd_cont(pmd))
> split_contpmd(pmdp);
> - ret = split_pmd(pmdp, pmd, GFP_ATOMIC, false);
> + ret = split_pmd(pmdp, pmd, gfp, false);
>
> /*
> * We have split the pmd directly to ptes so there is no need to
> @@ -793,9 +813,8 @@ static int __init split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
> return ret;
> }
>
> -static int __init split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
> - unsigned long next,
> - struct mm_walk *walk)
> +static int split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
> + unsigned long next, struct mm_walk *walk)
> {
> pte_t pte = __ptep_get(ptep);
>
> @@ -805,12 +824,18 @@ static int __init split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
> return 0;
> }
>
> -static const struct mm_walk_ops split_to_ptes_ops __initconst = {
> +static const struct mm_walk_ops split_to_ptes_ops = {
> .pud_entry = split_to_ptes_pud_entry,
> .pmd_entry = split_to_ptes_pmd_entry,
> .pte_entry = split_to_ptes_pte_entry,
> };
>
> +static int range_split_to_ptes(unsigned long start, unsigned long end, gfp_t gfp)
> +{
> + return walk_kernel_page_table_range_lockless(start, end,
> + &split_to_ptes_ops, NULL, &gfp);
> +}
> +
> static bool linear_map_requires_bbml2 __initdata;
>
> u32 idmap_kpti_bbml2_flag;
> @@ -847,11 +872,9 @@ static int __init linear_map_split_to_ptes(void *__unused)
> * PTE. The kernel alias remains static throughout runtime so
> * can continue to be safely mapped with large mappings.
> */
> - ret = walk_kernel_page_table_range_lockless(lstart, kstart,
> - &split_to_ptes_ops, NULL, NULL);
> + ret = range_split_to_ptes(lstart, kstart, GFP_ATOMIC);
> if (!ret)
> - ret = walk_kernel_page_table_range_lockless(kend, lend,
> - &split_to_ptes_ops, NULL, NULL);
> + ret = range_split_to_ptes(kend, lend, GFP_ATOMIC);
> if (ret)
> panic("Failed to split linear map\n");
> flush_tlb_kernel_range(lstart, lend);
> @@ -1002,6 +1025,33 @@ static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp)
> memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
> __kfence_pool = phys_to_virt(kfence_pool);
> }
> +
> +bool arch_kfence_init_pool(void)
> +{
> + unsigned long start = (unsigned long)__kfence_pool;
> + unsigned long end = start + KFENCE_POOL_SIZE;
> + int ret;
> +
> + /* Exit early if we know the linear map is already pte-mapped. */
> + if (!system_supports_bbml2_noabort() || force_pte_mapping())
> + return true;
> +
> + /* Kfence pool is already pte-mapped for the early init case. */
> + if (kfence_early_init)
> + return true;
> +
> + mutex_lock(&pgtable_split_lock);
> + ret = range_split_to_ptes(start, end, GFP_PGTABLE_KERNEL);
> + mutex_unlock(&pgtable_split_lock);
> +
> + /*
> + * Since the system supports bbml2_noabort, tlb invalidation is not
> + * required here; the pgtable mappings have been split to pte but larger
> + * entries may safely linger in the TLB.
> + */
> +
> + return !ret;
> +}
> #else /* CONFIG_KFENCE */
>
> static inline phys_addr_t arm64_kfence_alloc_pool(void) { return 0; }
> @@ -1009,16 +1059,6 @@ static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) {
>
> #endif /* CONFIG_KFENCE */
>
> -static inline bool force_pte_mapping(void)
> -{
> - bool bbml2 = system_capabilities_finalized() ?
> - system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort();
> -
> - return (!bbml2 && (rodata_full || arm64_kfence_can_set_direct_map() ||
> - is_realm_world())) ||
> - debug_pagealloc_enabled();
> -}
> -
> static void __init map_mem(pgd_t *pgdp)
> {
> static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);