Message-ID: <2b715836-b566-4a9e-b344-9401fa4c0feb@os.amperecomputing.com>
Date: Mon, 17 Mar 2025 10:53:09 -0700
From: Yang Shi <yang@...amperecomputing.com>
To: Ryan Roberts <ryan.roberts@....com>, will@...nel.org,
catalin.marinas@....com, Miko.Lenczewski@....com,
scott@...amperecomputing.com, cl@...two.org
Cc: linux-arm-kernel@...ts.infradead.org, linux-kernel@...r.kernel.org
Subject: Re: [v3 PATCH 3/6] arm64: mm: make __create_pgd_mapping() and helpers
non-void
On 3/14/25 4:51 AM, Ryan Roberts wrote:
> On 04/03/2025 22:19, Yang Shi wrote:
>> A later patch will enhance __create_pgd_mapping() and related helpers
>> to split the kernel linear mapping; this requires them to have a return
>> value. So make __create_pgd_mapping() and its helpers non-void functions.
>>
>> Also move the BUG_ON() out of the page table alloc helpers, since failing
>> to split the kernel linear mapping is not fatal and can be handled by the
>> callers in a later patch. Keep a BUG_ON() after
>> __create_pgd_mapping_locked() returns so the current callers' behavior
>> stays intact.
>>
>> Suggested-by: Ryan Roberts<ryan.roberts@....com>
>> Signed-off-by: Yang Shi<yang@...amperecomputing.com>
>> ---
>> arch/arm64/mm/mmu.c | 127 ++++++++++++++++++++++++++++++--------------
>> 1 file changed, 86 insertions(+), 41 deletions(-)
>>
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index b4df5bc5b1b8..dccf0877285b 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -189,11 +189,11 @@ static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
>> } while (ptep++, addr += PAGE_SIZE, addr != end);
>> }
>>
>> -static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>> - unsigned long end, phys_addr_t phys,
>> - pgprot_t prot,
>> - phys_addr_t (*pgtable_alloc)(int),
>> - int flags)
>> +static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>> + unsigned long end, phys_addr_t phys,
>> + pgprot_t prot,
>> + phys_addr_t (*pgtable_alloc)(int),
>> + int flags)
>> {
>> unsigned long next;
>> pmd_t pmd = READ_ONCE(*pmdp);
>> @@ -208,6 +208,8 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>> pmdval |= PMD_TABLE_PXN;
>> BUG_ON(!pgtable_alloc);
>> pte_phys = pgtable_alloc(PAGE_SHIFT);
>> + if (!pte_phys)
>> + return -ENOMEM;
> nit: personally I'd prefer to see a "goto out" and funnel all to a single return
> statement. You do that in some functions (via loop break), but would be cleaner
> if consistent.
>
> If pgtable_alloc() is modified to return int (see my comment at the bottom),
> this becomes:
>
> ret = pgtable_alloc(PAGE_SHIFT, &pte_phys);
> if (ret)
> goto out;
OK
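If I follow, the pmd_none() branch in alloc_init_cont_pte() would end up
looking something like the below (untested sketch, assuming pgtable_alloc
is converted to return an int and fill in a phys_addr_t, with "ret"
declared and initialized to 0 at the top of the function):

	if (pmd_none(pmd)) {
		...
		BUG_ON(!pgtable_alloc);
		ret = pgtable_alloc(PAGE_SHIFT, &pte_phys);
		if (ret)
			goto out;	/* fixmap not mapped yet, nothing to undo */
		ptep = pte_set_fixmap(pte_phys);
		init_clear_pgtable(ptep);
		ptep += pte_index(addr);
		...
	}

	/* ... cont-pte loop unchanged ... */

	pte_clear_fixmap();
out:
	return ret;

The alloc failure happens before the fixmap is taken, so the error path
can safely skip pte_clear_fixmap().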
>> ptep = pte_set_fixmap(pte_phys);
>> init_clear_pgtable(ptep);
>> ptep += pte_index(addr);
>> @@ -239,13 +241,16 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>> * walker.
>> */
>> pte_clear_fixmap();
>> +
>> + return 0;
>> }
>>
>> -static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
>> - phys_addr_t phys, pgprot_t prot,
>> - phys_addr_t (*pgtable_alloc)(int), int flags)
>> +static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
>> + phys_addr_t phys, pgprot_t prot,
>> + phys_addr_t (*pgtable_alloc)(int), int flags)
>> {
>> unsigned long next;
>> + int ret = 0;
>>
>> do {
>> pmd_t old_pmd = READ_ONCE(*pmdp);
>> @@ -264,22 +269,27 @@ static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
>> BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
>> READ_ONCE(pmd_val(*pmdp))));
>> } else {
>> - alloc_init_cont_pte(pmdp, addr, next, phys, prot,
>> + ret = alloc_init_cont_pte(pmdp, addr, next, phys, prot,
>> pgtable_alloc, flags);
>> + if (ret)
>> + break;
>>
>> BUG_ON(pmd_val(old_pmd) != 0 &&
>> pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp)));
>> }
>> phys += next - addr;
>> } while (pmdp++, addr = next, addr != end);
>> +
>> + return ret;
>> }
>>
>> -static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
>> - unsigned long end, phys_addr_t phys,
>> - pgprot_t prot,
>> - phys_addr_t (*pgtable_alloc)(int), int flags)
>> +static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
>> + unsigned long end, phys_addr_t phys,
>> + pgprot_t prot,
>> + phys_addr_t (*pgtable_alloc)(int), int flags)
>> {
>> unsigned long next;
>> + int ret = 0;
>> pud_t pud = READ_ONCE(*pudp);
>> pmd_t *pmdp;
>>
>> @@ -295,6 +305,8 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
>> pudval |= PUD_TABLE_PXN;
>> BUG_ON(!pgtable_alloc);
>> pmd_phys = pgtable_alloc(PMD_SHIFT);
>> + if (!pmd_phys)
>> + return -ENOMEM;
>> pmdp = pmd_set_fixmap(pmd_phys);
>> init_clear_pgtable(pmdp);
>> pmdp += pmd_index(addr);
>> @@ -314,21 +326,26 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
>> (flags & NO_CONT_MAPPINGS) == 0)
>> __prot = __pgprot(pgprot_val(prot) | PTE_CONT);
>>
>> - init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags);
>> + ret = init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags);
>> + if (ret)
>> + break;
>>
>> pmdp += pmd_index(next) - pmd_index(addr);
>> phys += next - addr;
>> } while (addr = next, addr != end);
>>
>> pmd_clear_fixmap();
>> +
>> + return ret;
>> }
>>
>> -static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>> - phys_addr_t phys, pgprot_t prot,
>> - phys_addr_t (*pgtable_alloc)(int),
>> - int flags)
>> +static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>> + phys_addr_t phys, pgprot_t prot,
>> + phys_addr_t (*pgtable_alloc)(int),
>> + int flags)
>> {
>> unsigned long next;
>> + int ret = 0;
>> p4d_t p4d = READ_ONCE(*p4dp);
>> pud_t *pudp;
>>
>> @@ -340,6 +357,8 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>> p4dval |= P4D_TABLE_PXN;
>> BUG_ON(!pgtable_alloc);
>> pud_phys = pgtable_alloc(PUD_SHIFT);
>> + if (!pud_phys)
>> + return -ENOMEM;
>> pudp = pud_set_fixmap(pud_phys);
>> init_clear_pgtable(pudp);
>> pudp += pud_index(addr);
>> @@ -369,8 +388,10 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>> BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
>> READ_ONCE(pud_val(*pudp))));
>> } else {
>> - alloc_init_cont_pmd(pudp, addr, next, phys, prot,
>> + ret = alloc_init_cont_pmd(pudp, addr, next, phys, prot,
>> pgtable_alloc, flags);
>> + if (ret)
>> + break;
>>
>> BUG_ON(pud_val(old_pud) != 0 &&
>> pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
>> @@ -379,14 +400,17 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>> } while (pudp++, addr = next, addr != end);
>>
>> pud_clear_fixmap();
>> +
>> + return ret;
>> }
>>
>> -static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
>> - phys_addr_t phys, pgprot_t prot,
>> - phys_addr_t (*pgtable_alloc)(int),
>> - int flags)
>> +static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
>> + phys_addr_t phys, pgprot_t prot,
>> + phys_addr_t (*pgtable_alloc)(int),
>> + int flags)
>> {
>> unsigned long next;
>> + int ret = 0;
>> pgd_t pgd = READ_ONCE(*pgdp);
>> p4d_t *p4dp;
>>
>> @@ -398,6 +422,8 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
>> pgdval |= PGD_TABLE_PXN;
>> BUG_ON(!pgtable_alloc);
>> p4d_phys = pgtable_alloc(P4D_SHIFT);
>> + if (!p4d_phys)
>> + return -ENOMEM;
>> p4dp = p4d_set_fixmap(p4d_phys);
>> init_clear_pgtable(p4dp);
>> p4dp += p4d_index(addr);
>> @@ -412,8 +438,10 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
>>
>> next = p4d_addr_end(addr, end);
>>
>> - alloc_init_pud(p4dp, addr, next, phys, prot,
>> + ret = alloc_init_pud(p4dp, addr, next, phys, prot,
>> pgtable_alloc, flags);
>> + if (ret)
>> + break;
>>
>> BUG_ON(p4d_val(old_p4d) != 0 &&
>> p4d_val(old_p4d) != READ_ONCE(p4d_val(*p4dp)));
>> @@ -422,23 +450,26 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
>> } while (p4dp++, addr = next, addr != end);
>>
>> p4d_clear_fixmap();
>> +
>> + return ret;
>> }
>>
>> -static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
>> - unsigned long virt, phys_addr_t size,
>> - pgprot_t prot,
>> - phys_addr_t (*pgtable_alloc)(int),
>> - int flags)
>> +static int __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
>> + unsigned long virt, phys_addr_t size,
>> + pgprot_t prot,
>> + phys_addr_t (*pgtable_alloc)(int),
>> + int flags)
>> {
>> unsigned long addr, end, next;
>> pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
>> + int ret = 0;
>>
>> /*
>> * If the virtual and physical address don't have the same offset
>> * within a page, we cannot map the region as the caller expects.
>> */
>> if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
>> - return;
>> + return -EINVAL;
>>
>> phys &= PAGE_MASK;
>> addr = virt & PAGE_MASK;
>> @@ -446,29 +477,38 @@ static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
>>
>> do {
>> next = pgd_addr_end(addr, end);
>> - alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc,
>> + ret = alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc,
>> flags);
>> + if (ret)
>> + break;
>> phys += next - addr;
>> } while (pgdp++, addr = next, addr != end);
>> +
>> + return ret;
>> }
>>
>> -static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
>> - unsigned long virt, phys_addr_t size,
>> - pgprot_t prot,
>> - phys_addr_t (*pgtable_alloc)(int),
>> - int flags)
>> +static int __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
>> + unsigned long virt, phys_addr_t size,
>> + pgprot_t prot,
>> + phys_addr_t (*pgtable_alloc)(int),
>> + int flags)
>> {
>> + int ret;
>> +
>> mutex_lock(&fixmap_lock);
>> - __create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
>> + ret = __create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
>> pgtable_alloc, flags);
>> + BUG_ON(ret);
> This function now returns an error, but also BUGs on ret!=0. For this patch, I'd
> suggest keeping this function as void.
You mean __create_pgd_mapping(), right?
> But I believe there is a pre-existing bug in arch_add_memory(). That's called at
> runtime so if __create_pgd_mapping() fails and BUGs, it will take down a running
> system.
Yes, it is the current behavior.
> With this foundational patch, we can fix that with an additional patch to pass
> along the error code instead of BUGing in that case. arch_add_memory() would
> need to unwind whatever __create_pgd_mapping() managed to do before the memory
> allocation failure (presumably unmapping and freeing any allocated tables). I'm
> happy to do this as a follow up patch.
Yes, the allocated page tables need to be freed. Thank you for taking it.
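Just to make sure we are thinking about the same thing, I guess the
follow-up in arch_add_memory() would look roughly like the below (sketch
only; whether __remove_pgd_mapping() can safely tear down a partially
created mapping and free its tables is the part that needs a closer look):

	ret = __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
				   size, params->pgprot, __pgd_pgtable_alloc,
				   flags);
	if (ret) {
		__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start),
				     size);
		return ret;
	}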
>> mutex_unlock(&fixmap_lock);
>> +
>> + return ret;
>> }
>>
>> #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
>> extern __alias(__create_pgd_mapping_locked)
>> -void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
>> - phys_addr_t size, pgprot_t prot,
>> - phys_addr_t (*pgtable_alloc)(int), int flags);
>> +int create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
>> + phys_addr_t size, pgprot_t prot,
>> + phys_addr_t (*pgtable_alloc)(int), int flags);
> create_kpti_ng_temp_pgd() now returns error instead of BUGing on allocation
> failure, but I don't see a change to handle that error. You'll want to update
> __kpti_install_ng_mappings() to BUG on error.
Yes, I missed that. It should BUG on error.
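Will fix it in the next version, i.e. roughly the below in
__kpti_install_ng_mappings() (the call arguments are just the existing
ones; only the return value check is new):

	int err;

	err = create_kpti_ng_temp_pgd(kpti_ng_temp_pgd, __pa(alloc),
				      KPTI_NG_TEMP_VA, PAGE_SIZE, PAGE_KERNEL,
				      kpti_ng_pgd_alloc, 0);
	/* Failing to build the KPTI temp page tables at boot is fatal */
	BUG_ON(err);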
>> #endif
>>
>> static phys_addr_t __pgd_pgtable_alloc(int shift)
>> @@ -476,13 +516,17 @@ static phys_addr_t __pgd_pgtable_alloc(int shift)
>> /* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
>> void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO);
>>
>> - BUG_ON(!ptr);
>> + if (!ptr)
>> + return 0;
> 0 is a valid (though unlikely) physical address. I guess you could technically
> encode like ERR_PTR(), but since you are returning phys_addr_t and not a
> pointer, then perhaps it will be clearer to make this return int and accept a
> pointer to a phys_addr_t, which it will populate on success?
Actually I did something similar in the first place, but just returned
the virt address and handled the NULL case in the callers. That made the
code a little messier, since we need to convert the virt address to a
phys address (because __create_pgd_mapping() and the helpers take phys
addresses) and change the function definitions.
But if I remember correctly, 0 should not be a valid phys address. I also
noticed early_pgtable_alloc() calls memblock_phys_alloc_range(), which
returns 0 on failure; if 0 were a valid phys address, it should not do
that, right? And the memblock range 0 - memstart_addr is actually removed
from memblock (see arm64_memblock_init()), so IIUC 0 should not be a
valid phys address. So the patch ended up as is.
If this assumption doesn't stand, I think your suggestion makes sense.
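If it turns out 0 can be a valid phys address, I'd imagine the allocator
ending up like the below (just a sketch of your suggestion, not what this
patch does), with pgd_pgtable_alloc() and the callers updated to match:

static int __pgd_pgtable_alloc(int shift, phys_addr_t *pa)
{
	/* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
	void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO);

	if (!ptr)
		return -ENOMEM;

	*pa = __pa(ptr);
	return 0;
}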
>> +
>> return __pa(ptr);
>> }
>>
>> static phys_addr_t pgd_pgtable_alloc(int shift)
>> {
>> phys_addr_t pa = __pgd_pgtable_alloc(shift);
>> + if (!pa)
>> + goto out;
> This would obviously need to be fixed up as per above.
>
>> struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa));
>>
>> /*
>> @@ -498,6 +542,7 @@ static phys_addr_t pgd_pgtable_alloc(int shift)
>> else if (shift == PMD_SHIFT)
>> BUG_ON(!pagetable_pmd_ctor(ptdesc));
>>
>> +out:
>> return pa;
>> }
>>
> You have left early_pgtable_alloc() to panic() on allocation failure. Given we
> can now unwind the stack with error code, I think it would be more consistent to
> also allow early_pgtable_alloc() to return error.
early_pgtable_alloc() is just used for painting the linear mapping at the
early boot stage; if it fails, I don't think unwinding the stack is
feasible or worth it. Did I miss something?
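For reference, the failure handling in early_pgtable_alloc() today is
simply (roughly):

	phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0,
					 MEMBLOCK_ALLOC_NOLEAKTRACE);
	if (!phys)
		panic("Failed to allocate page table page\n");

so there is nothing to unwind there in the first place.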
Thanks,
Yang
> Thanks,
> Ryan
>