Message-ID: <2b715836-b566-4a9e-b344-9401fa4c0feb@os.amperecomputing.com>
Date: Mon, 17 Mar 2025 10:53:09 -0700
From: Yang Shi <yang@...amperecomputing.com>
To: Ryan Roberts <ryan.roberts@....com>, will@...nel.org,
 catalin.marinas@....com, Miko.Lenczewski@....com,
 scott@...amperecomputing.com, cl@...two.org
Cc: linux-arm-kernel@...ts.infradead.org, linux-kernel@...r.kernel.org
Subject: Re: [v3 PATCH 3/6] arm64: mm: make __create_pgd_mapping() and helpers
 non-void



On 3/14/25 4:51 AM, Ryan Roberts wrote:
> On 04/03/2025 22:19, Yang Shi wrote:
>> A later patch will enhance __create_pgd_mapping() and related helpers to
>> split the kernel linear mapping, which requires them to have a return
>> value.  So make __create_pgd_mapping() and its helpers non-void functions.
>>
>> Also move the BUG_ON() out of the page table alloc helper, since failing to
>> split the kernel linear mapping is not fatal and can be handled by the
>> callers in the later patch.  Add a BUG_ON() after
>> __create_pgd_mapping_locked() returns to keep the current callers' behavior
>> intact.
>>
>> Suggested-by: Ryan Roberts<ryan.roberts@....com>
>> Signed-off-by: Yang Shi<yang@...amperecomputing.com>
>> ---
>>   arch/arm64/mm/mmu.c | 127 ++++++++++++++++++++++++++++++--------------
>>   1 file changed, 86 insertions(+), 41 deletions(-)
>>
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index b4df5bc5b1b8..dccf0877285b 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -189,11 +189,11 @@ static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
>>   	} while (ptep++, addr += PAGE_SIZE, addr != end);
>>   }
>>   
>> -static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>> -				unsigned long end, phys_addr_t phys,
>> -				pgprot_t prot,
>> -				phys_addr_t (*pgtable_alloc)(int),
>> -				int flags)
>> +static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>> +			       unsigned long end, phys_addr_t phys,
>> +			       pgprot_t prot,
>> +			       phys_addr_t (*pgtable_alloc)(int),
>> +			       int flags)
>>   {
>>   	unsigned long next;
>>   	pmd_t pmd = READ_ONCE(*pmdp);
>> @@ -208,6 +208,8 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>>   			pmdval |= PMD_TABLE_PXN;
>>   		BUG_ON(!pgtable_alloc);
>>   		pte_phys = pgtable_alloc(PAGE_SHIFT);
>> +		if (!pte_phys)
>> +			return -ENOMEM;
> nit: personally I'd prefer to see a "goto out" and funnel everything to a single
> return statement. You do that in some functions (via loop break), but it would be
> cleaner to be consistent.
>
> If pgtable_alloc() is modified to return int (see my comment at the bottom),
> this becomes:
>
> ret = pgtable_alloc(PAGE_SHIFT, &pte_phys);
> if (ret)
> 	goto out;

OK
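
For alloc_init_cont_pte() I imagine it would look roughly like this (rough, 
untested sketch, assuming pgtable_alloc() is converted to return an int and 
hand back the physical address through an out parameter, as you suggest at 
the bottom):

static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
			       unsigned long end, phys_addr_t phys,
			       pgprot_t prot,
			       int (*pgtable_alloc)(int, phys_addr_t *),
			       int flags)
{
	int ret = 0;
	...
	if (pmd_none(pmd)) {
		phys_addr_t pte_phys;
		...
		BUG_ON(!pgtable_alloc);
		/* Allocation failure funnels to the single return point. */
		ret = pgtable_alloc(PAGE_SHIFT, &pte_phys);
		if (ret)
			goto out;
		...
	}
	...
out:
	return ret;
}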

>>   		ptep = pte_set_fixmap(pte_phys);
>>   		init_clear_pgtable(ptep);
>>   		ptep += pte_index(addr);
>> @@ -239,13 +241,16 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>>   	 * walker.
>>   	 */
>>   	pte_clear_fixmap();
>> +
>> +	return 0;
>>   }
>>   
>> -static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
>> -		     phys_addr_t phys, pgprot_t prot,
>> -		     phys_addr_t (*pgtable_alloc)(int), int flags)
>> +static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
>> +		    phys_addr_t phys, pgprot_t prot,
>> +		    phys_addr_t (*pgtable_alloc)(int), int flags)
>>   {
>>   	unsigned long next;
>> +	int ret = 0;
>>   
>>   	do {
>>   		pmd_t old_pmd = READ_ONCE(*pmdp);
>> @@ -264,22 +269,27 @@ static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
>>   			BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
>>   						      READ_ONCE(pmd_val(*pmdp))));
>>   		} else {
>> -			alloc_init_cont_pte(pmdp, addr, next, phys, prot,
>> +			ret = alloc_init_cont_pte(pmdp, addr, next, phys, prot,
>>   					    pgtable_alloc, flags);
>> +			if (ret)
>> +				break;
>>   
>>   			BUG_ON(pmd_val(old_pmd) != 0 &&
>>   			       pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp)));
>>   		}
>>   		phys += next - addr;
>>   	} while (pmdp++, addr = next, addr != end);
>> +
>> +	return ret;
>>   }
>>   
>> -static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
>> -				unsigned long end, phys_addr_t phys,
>> -				pgprot_t prot,
>> -				phys_addr_t (*pgtable_alloc)(int), int flags)
>> +static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
>> +			       unsigned long end, phys_addr_t phys,
>> +			       pgprot_t prot,
>> +			       phys_addr_t (*pgtable_alloc)(int), int flags)
>>   {
>>   	unsigned long next;
>> +	int ret = 0;
>>   	pud_t pud = READ_ONCE(*pudp);
>>   	pmd_t *pmdp;
>>   
>> @@ -295,6 +305,8 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
>>   			pudval |= PUD_TABLE_PXN;
>>   		BUG_ON(!pgtable_alloc);
>>   		pmd_phys = pgtable_alloc(PMD_SHIFT);
>> +		if (!pmd_phys)
>> +			return -ENOMEM;
>>   		pmdp = pmd_set_fixmap(pmd_phys);
>>   		init_clear_pgtable(pmdp);
>>   		pmdp += pmd_index(addr);
>> @@ -314,21 +326,26 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
>>   		    (flags & NO_CONT_MAPPINGS) == 0)
>>   			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);
>>   
>> -		init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags);
>> +		ret = init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags);
>> +		if (ret)
>> +			break;
>>   
>>   		pmdp += pmd_index(next) - pmd_index(addr);
>>   		phys += next - addr;
>>   	} while (addr = next, addr != end);
>>   
>>   	pmd_clear_fixmap();
>> +
>> +	return ret;
>>   }
>>   
>> -static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>> -			   phys_addr_t phys, pgprot_t prot,
>> -			   phys_addr_t (*pgtable_alloc)(int),
>> -			   int flags)
>> +static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>> +			  phys_addr_t phys, pgprot_t prot,
>> +			  phys_addr_t (*pgtable_alloc)(int),
>> +			  int flags)
>>   {
>>   	unsigned long next;
>> +	int ret = 0;
>>   	p4d_t p4d = READ_ONCE(*p4dp);
>>   	pud_t *pudp;
>>   
>> @@ -340,6 +357,8 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>>   			p4dval |= P4D_TABLE_PXN;
>>   		BUG_ON(!pgtable_alloc);
>>   		pud_phys = pgtable_alloc(PUD_SHIFT);
>> +		if (!pud_phys)
>> +			return -ENOMEM;
>>   		pudp = pud_set_fixmap(pud_phys);
>>   		init_clear_pgtable(pudp);
>>   		pudp += pud_index(addr);
>> @@ -369,8 +388,10 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>>   			BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
>>   						      READ_ONCE(pud_val(*pudp))));
>>   		} else {
>> -			alloc_init_cont_pmd(pudp, addr, next, phys, prot,
>> +			ret = alloc_init_cont_pmd(pudp, addr, next, phys, prot,
>>   					    pgtable_alloc, flags);
>> +			if (ret)
>> +				break;
>>   
>>   			BUG_ON(pud_val(old_pud) != 0 &&
>>   			       pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
>> @@ -379,14 +400,17 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>>   	} while (pudp++, addr = next, addr != end);
>>   
>>   	pud_clear_fixmap();
>> +
>> +	return ret;
>>   }
>>   
>> -static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
>> -			   phys_addr_t phys, pgprot_t prot,
>> -			   phys_addr_t (*pgtable_alloc)(int),
>> -			   int flags)
>> +static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
>> +			  phys_addr_t phys, pgprot_t prot,
>> +			  phys_addr_t (*pgtable_alloc)(int),
>> +			  int flags)
>>   {
>>   	unsigned long next;
>> +	int ret = 0;
>>   	pgd_t pgd = READ_ONCE(*pgdp);
>>   	p4d_t *p4dp;
>>   
>> @@ -398,6 +422,8 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
>>   			pgdval |= PGD_TABLE_PXN;
>>   		BUG_ON(!pgtable_alloc);
>>   		p4d_phys = pgtable_alloc(P4D_SHIFT);
>> +		if (!p4d_phys)
>> +			return -ENOMEM;
>>   		p4dp = p4d_set_fixmap(p4d_phys);
>>   		init_clear_pgtable(p4dp);
>>   		p4dp += p4d_index(addr);
>> @@ -412,8 +438,10 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
>>   
>>   		next = p4d_addr_end(addr, end);
>>   
>> -		alloc_init_pud(p4dp, addr, next, phys, prot,
>> +		ret = alloc_init_pud(p4dp, addr, next, phys, prot,
>>   			       pgtable_alloc, flags);
>> +		if (ret)
>> +			break;
>>   
>>   		BUG_ON(p4d_val(old_p4d) != 0 &&
>>   		       p4d_val(old_p4d) != READ_ONCE(p4d_val(*p4dp)));
>> @@ -422,23 +450,26 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
>>   	} while (p4dp++, addr = next, addr != end);
>>   
>>   	p4d_clear_fixmap();
>> +
>> +	return ret;
>>   }
>>   
>> -static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
>> -					unsigned long virt, phys_addr_t size,
>> -					pgprot_t prot,
>> -					phys_addr_t (*pgtable_alloc)(int),
>> -					int flags)
>> +static int __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
>> +				       unsigned long virt, phys_addr_t size,
>> +				       pgprot_t prot,
>> +				       phys_addr_t (*pgtable_alloc)(int),
>> +				       int flags)
>>   {
>>   	unsigned long addr, end, next;
>>   	pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
>> +	int ret = 0;
>>   
>>   	/*
>>   	 * If the virtual and physical address don't have the same offset
>>   	 * within a page, we cannot map the region as the caller expects.
>>   	 */
>>   	if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
>> -		return;
>> +		return -EINVAL;
>>   
>>   	phys &= PAGE_MASK;
>>   	addr = virt & PAGE_MASK;
>> @@ -446,29 +477,38 @@ static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
>>   
>>   	do {
>>   		next = pgd_addr_end(addr, end);
>> -		alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc,
>> +		ret = alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc,
>>   			       flags);
>> +		if (ret)
>> +			break;
>>   		phys += next - addr;
>>   	} while (pgdp++, addr = next, addr != end);
>> +
>> +	return ret;
>>   }
>>   
>> -static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
>> -				 unsigned long virt, phys_addr_t size,
>> -				 pgprot_t prot,
>> -				 phys_addr_t (*pgtable_alloc)(int),
>> -				 int flags)
>> +static int __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
>> +				unsigned long virt, phys_addr_t size,
>> +				pgprot_t prot,
>> +				phys_addr_t (*pgtable_alloc)(int),
>> +				int flags)
>>   {
>> +	int ret;
>> +
>>   	mutex_lock(&fixmap_lock);
>> -	__create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
>> +	ret = __create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
>>   				    pgtable_alloc, flags);
>> +	BUG_ON(ret);
> This function now returns an error, but also BUGs on ret!=0. For this patch, I'd
> suggest keeping this function as void.

You mean __create_pgd_mapping(), right?

> But I believe there is a pre-existing bug in arch_add_memory(). That's called at
> runtime so if __create_pgd_mapping() fails and BUGs, it will take down a running
> system.

Yes, it is the current behavior.

> With this foundational patch, we can fix that with an additional patch to pass
> along the error code instead of BUGing in that case. arch_add_memory() would
> need to unwind whatever __create_pgd_mapping() managed to do before the memory
> allocation failure (presumably unmapping and freeing any allocated tables). I'm
> happy to do this as a follow up patch.

Yes, the allocated page tables need to be freed. Thank you for taking that on.

>>   	mutex_unlock(&fixmap_lock);
>> +
>> +	return ret;
>>   }
>>   
>>   #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
>>   extern __alias(__create_pgd_mapping_locked)
>> -void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
>> -			     phys_addr_t size, pgprot_t prot,
>> -			     phys_addr_t (*pgtable_alloc)(int), int flags);
>> +int create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
>> +			    phys_addr_t size, pgprot_t prot,
>> +			    phys_addr_t (*pgtable_alloc)(int), int flags);
> create_kpti_ng_temp_pgd() now returns an error instead of BUGing on allocation
> failure, but I don't see a change to handle that error. You'll want to update
> __kpti_install_ng_mappings() to BUG on error.

Yes, I missed that. It should BUG on error.
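
Something like this at the call site in __kpti_install_ng_mappings(), I think 
(sketch only; the existing argument list is elided here):

	int err;

	err = create_kpti_ng_temp_pgd(...);
	BUG_ON(err);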

>>   #endif
>>   
>>   static phys_addr_t __pgd_pgtable_alloc(int shift)
>> @@ -476,13 +516,17 @@ static phys_addr_t __pgd_pgtable_alloc(int shift)
>>   	/* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
>>   	void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO);
>>   
>> -	BUG_ON(!ptr);
>> +	if (!ptr)
>> +		return 0;
> 0 is a valid (though unlikely) physical address. I guess you could technically
> encode it like ERR_PTR(), but since you are returning a phys_addr_t and not a
> pointer, perhaps it would be clearer to make this return int and accept a
> pointer to a phys_addr_t, which it populates on success?

Actually I did something similar in the first place, but just returned the 
virt address and handled the NULL case in the callers. That made the code a 
little messier, since we need to convert the virt address to a phys address 
(__create_pgd_mapping() and the helpers require a phys address), and it also 
changed the function definitions.

But if I remember correctly, 0 should not be a valid phys address. I also 
noticed that early_pgtable_alloc() calls memblock_phys_alloc_range(), which 
returns 0 on failure. If 0 were a valid phys address, it should not do that, 
right? And the memblock range 0 - memstart_addr is actually removed from 
memblock (see arm64_memblock_init()), so IIUC 0 should not be a valid phys 
address. So the patch ended up as is.

If this assumption doesn't hold, I think your suggestion makes sense.
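In that case I'd imagine something roughly like the below (untested sketch of 
the signature change you suggested):

static int __pgd_pgtable_alloc(int shift, phys_addr_t *pa)
{
	/* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
	void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO);

	if (!ptr)
		return -ENOMEM;

	*pa = __pa(ptr);
	return 0;
}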

>> +
>>   	return __pa(ptr);
>>   }
>>   
>>   static phys_addr_t pgd_pgtable_alloc(int shift)
>>   {
>>   	phys_addr_t pa = __pgd_pgtable_alloc(shift);
>> +	if (!pa)
>> +		goto out;
> This would obviously need to be fixed up as per above.
>
>>   	struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa));
>>   
>>   	/*
>> @@ -498,6 +542,7 @@ static phys_addr_t pgd_pgtable_alloc(int shift)
>>   	else if (shift == PMD_SHIFT)
>>   		BUG_ON(!pagetable_pmd_ctor(ptdesc));
>>   
>> +out:
>>   	return pa;
>>   }
>>   
> You have left early_pgtable_alloc() to panic() on allocation failure. Given we
> can now unwind the stack with an error code, I think it would be more consistent
> to also allow early_pgtable_alloc() to return an error.

early_pgtable_alloc() is just used for painting the linear mapping at the 
early boot stage; if it fails, I don't think unwinding the stack is feasible 
or worth it. Did I miss something?

Thanks,
Yang

> Thanks,
> Ryan
>

