Message-ID: <d9960dbc-376d-4d33-9334-e4eb546b69ae@arm.com>
Date: Tue, 29 Jul 2025 18:04:26 +0530
From: Dev Jain <dev.jain@....com>
To: Yang Shi <yang@...amperecomputing.com>, ryan.roberts@....com,
 will@...nel.org, catalin.marinas@....com, akpm@...ux-foundation.org,
 Miko.Lenczewski@....com, scott@...amperecomputing.com, cl@...two.org
Cc: linux-arm-kernel@...ts.infradead.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH 3/4] arm64: mm: support large block mapping when
 rodata=full


On 25/07/25 3:41 am, Yang Shi wrote:
> [----- snip -----]
>   
>   #ifdef CONFIG_ARM64_PAN
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 3d5fb37424ab..f63b39613571 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -480,6 +480,8 @@ void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
>   			     int flags);
>   #endif
>   
> +#define INVALID_PHYS_ADDR	-1
> +
>   static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm,
>   				       enum pgtable_type pgtable_type)
>   {
> @@ -487,7 +489,9 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm,
>   	struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
>   	phys_addr_t pa;
>   
> -	BUG_ON(!ptdesc);
> +	if (!ptdesc)
> +		return INVALID_PHYS_ADDR;
> +
>   	pa = page_to_phys(ptdesc_page(ptdesc));
>   
>   	switch (pgtable_type) {
> @@ -509,15 +513,29 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm,
>   }
>   
>   static phys_addr_t __maybe_unused
> -pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type)
> +split_pgtable_alloc(enum pgtable_type pgtable_type)
>   {
>   	return __pgd_pgtable_alloc(&init_mm, pgtable_type);
>   }
>   
> +static phys_addr_t __maybe_unused
> +pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type)
> +{
> +	phys_addr_t pa;
> +
> +	pa = __pgd_pgtable_alloc(&init_mm, pgtable_type);
> +	BUG_ON(pa == INVALID_PHYS_ADDR);
> +	return pa;
> +}
> +
>   static phys_addr_t
>   pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type)
>   {
> -	return __pgd_pgtable_alloc(NULL, pgtable_type);
> +	phys_addr_t pa;
> +
> +	pa = __pgd_pgtable_alloc(NULL, pgtable_type);
> +	BUG_ON(pa == INVALID_PHYS_ADDR);
> +	return pa;
>   }
>   
>   /*
> @@ -552,6 +570,254 @@ void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
>   			     pgd_pgtable_alloc_special_mm, flags);
>   }
>   
> +static DEFINE_MUTEX(pgtable_split_lock);

Thanks for taking a separate lock.

> +
> +static int split_cont_pte(pmd_t *pmdp, unsigned long addr, unsigned long end)
> +{
> +	pte_t *ptep;
> +	unsigned long next;
> +	unsigned int nr;
> +	unsigned long span;
> +
> +	ptep = pte_offset_kernel(pmdp, addr);
> +
> +	do {
> +		pte_t *_ptep;
> +
> +		nr = 0;
> +		next = pte_cont_addr_end(addr, end);
> +		if (next < end)
> +			nr = max(nr, ((end - next) / CONT_PTE_SIZE));
> +		span = nr * CONT_PTE_SIZE;
> +
> +		_ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES);
> +		ptep += pte_index(next) - pte_index(addr) + nr * CONT_PTES;
> +
> +		if (((addr | next) & ~CONT_PTE_MASK) == 0)
> +			continue;
> +
> +		if (!pte_cont(__ptep_get(_ptep)))
> +			continue;
> +
> +		for (int i = 0; i < CONT_PTES; i++, _ptep++)
> +			__set_pte(_ptep, pte_mknoncont(__ptep_get(_ptep)));
> +	} while (addr = next + span, addr != end);
> +
> +	return 0;
> +}
> +
> +static int split_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end)
> +{
> +	unsigned long next;
> +	unsigned int nr;
> +	unsigned long span;
> +	int ret = 0;
> +
> +	do {
> +		pmd_t pmd;
> +
> +		nr = 1;
> +		next = pmd_addr_end(addr, end);
> +		if (next < end)
> +			nr = max(nr, ((end - next) / PMD_SIZE));
> +		span = (nr - 1) * PMD_SIZE;
> +
> +		if (((addr | next) & ~PMD_MASK) == 0)
> +			continue;
> +
> +		pmd = pmdp_get(pmdp);
> +		if (pmd_leaf(pmd)) {
> +			phys_addr_t pte_phys;
> +			pte_t *ptep;
> +			pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN |
> +					  PMD_TABLE_AF;
> +			unsigned long pfn = pmd_pfn(pmd);
> +			pgprot_t prot = pmd_pgprot(pmd);
> +
> +			pte_phys = split_pgtable_alloc(TABLE_PTE);
> +			if (pte_phys == INVALID_PHYS_ADDR)
> +				return -ENOMEM;
> +
> +			if (pgprot_val(prot) & PMD_SECT_PXN)
> +				pmdval |= PMD_TABLE_PXN;
> +
> +			prot = __pgprot((pgprot_val(prot) & ~PTE_TYPE_MASK) |
> +					PTE_TYPE_PAGE);
> +			prot = __pgprot(pgprot_val(prot) | PTE_CONT);
> +			ptep = (pte_t *)phys_to_virt(pte_phys);
> +			for (int i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++)
> +				__set_pte(ptep, pfn_pte(pfn, prot));
> +
> +			dsb(ishst);
> +
> +			__pmd_populate(pmdp, pte_phys, pmdval);
> +		}
> +
> +		ret = split_cont_pte(pmdp, addr, next);
> +		if (ret)
> +			break;
> +	} while (pmdp += nr, addr = next + span, addr != end);
> +
> +	return ret;
> +}
> +
> +static int split_cont_pmd(pud_t *pudp, unsigned long addr, unsigned long end)
> +{
> +	pmd_t *pmdp;
> +	unsigned long next;
> +	unsigned int nr;
> +	unsigned long span;
> +	int ret = 0;
> +
> +	pmdp = pmd_offset(pudp, addr);
> +
> +	do {
> +		pmd_t *_pmdp;
> +
> +		nr = 0;
> +		next = pmd_cont_addr_end(addr, end);
> +		if (next < end)
> +			nr = max(nr, ((end - next) / CONT_PMD_SIZE));
> +		span = nr * CONT_PMD_SIZE;
> +
> +		if (((addr | next) & ~CONT_PMD_MASK) == 0) {
> +			pmdp += pmd_index(next) - pmd_index(addr) +
> +				nr * CONT_PMDS;
> +			continue;
> +		}
> +
> +		_pmdp = PTR_ALIGN_DOWN(pmdp, sizeof(*pmdp) * CONT_PMDS);
> +		if (!pmd_cont(pmdp_get(_pmdp)))
> +			goto split;
> +
> +		for (int i = 0; i < CONT_PMDS; i++, _pmdp++)
> +			set_pmd(_pmdp, pmd_mknoncont(pmdp_get(_pmdp)));
> +
> +split:
> +		ret = split_pmd(pmdp, addr, next);
> +		if (ret)
> +			break;
> +
> +		pmdp += pmd_index(next) - pmd_index(addr) + nr * CONT_PMDS;
> +	} while (addr = next + span, addr != end);
> +
> +	return ret;
> +}
> +
> +static int split_pud(p4d_t *p4dp, unsigned long addr, unsigned long end)
> +{
> +	pud_t *pudp;
> +	unsigned long next;
> +	unsigned int nr;
> +	unsigned long span;
> +	int ret = 0;
> +
> +	pudp = pud_offset(p4dp, addr);
> +
> +	do {
> +		pud_t pud;
> +
> +		nr = 1;
> +		next = pud_addr_end(addr, end);
> +		if (next < end)
> +			nr = max(nr, ((end - next) / PUD_SIZE));
> +		span = (nr - 1) * PUD_SIZE;
> +
> +		if (((addr | next) & ~PUD_MASK) == 0)
> +			continue;
> +
> +		pud = pudp_get(pudp);
> +		if (pud_leaf(pud)) {
> +			phys_addr_t pmd_phys;
> +			pmd_t *pmdp;
> +			pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN |
> +					  PUD_TABLE_AF;
> +			unsigned long pfn = pud_pfn(pud);
> +			pgprot_t prot = pud_pgprot(pud);
> +			unsigned int step = PMD_SIZE >> PAGE_SHIFT;
> +
> +			pmd_phys = split_pgtable_alloc(TABLE_PMD);
> +			if (pmd_phys == INVALID_PHYS_ADDR)
> +				return -ENOMEM;
> +
> +			if (pgprot_val(prot) & PMD_SECT_PXN)
> +				pudval |= PUD_TABLE_PXN;
> +
> +			prot = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) |
> +					PMD_TYPE_SECT);
> +			prot = __pgprot(pgprot_val(prot) | PTE_CONT);
> +			pmdp = (pmd_t *)phys_to_virt(pmd_phys);
> +			for (int i = 0; i < PTRS_PER_PMD; i++, pmdp++) {
> +				set_pmd(pmdp, pfn_pmd(pfn, prot));
> +				pfn += step;
> +			}
> +
> +			dsb(ishst);
> +
> +			__pud_populate(pudp, pmd_phys, pudval);
> +		}
> +
> +		ret = split_cont_pmd(pudp, addr, next);
> +		if (ret)
> +			break;
> +	} while (pudp += nr, addr = next + span, addr != end);
> +
> +	return ret;
> +}
> +
> +static int split_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end)
> +{
> +	p4d_t *p4dp;
> +	unsigned long next;
> +	int ret = 0;
> +
> +	p4dp = p4d_offset(pgdp, addr);
> +
> +	do {
> +		next = p4d_addr_end(addr, end);
> +
> +		ret = split_pud(p4dp, addr, next);
> +		if (ret)
> +			break;
> +	} while (p4dp++, addr = next, addr != end);
> +
> +	return ret;
> +}
> +
> +static int split_pgd(pgd_t *pgdp, unsigned long addr, unsigned long end)
> +{
> +	unsigned long next;
> +	int ret = 0;
> +
> +	do {
> +		next = pgd_addr_end(addr, end);
> +		ret = split_p4d(pgdp, addr, next);
> +		if (ret)
> +			break;
> +	} while (pgdp++, addr = next, addr != end);
> +
> +	return ret;
> +}
> +
> +int split_kernel_pgtable_mapping(unsigned long start, unsigned long end)
> +{
> +	int ret;
> +
> +	if (!system_supports_bbml2_noabort())
> +		return 0;
> +
> +	if (start != PAGE_ALIGN(start) || end != PAGE_ALIGN(end))
> +		return -EINVAL;
> +
> +	mutex_lock(&pgtable_split_lock);
> +	arch_enter_lazy_mmu_mode();
> +	ret = split_pgd(pgd_offset_k(start), start, end);
> +	arch_leave_lazy_mmu_mode();
> +	mutex_unlock(&pgtable_split_lock);
> +
> +	return ret;
> +}
> +
>   	/*

--- snip ---

I'm afraid I'll have to agree with Ryan :) Looking at the signature of split_kernel_pgtable_mapping,
one would expect this function to split all block mappings in the region. But that is just a nit;
what does not seem right to me is that we iterate over the entire range when we know *exactly* where
the split has to be made, just to save on pgd/p4d/pud loads - a saving that is probably cancelled out
by the unnecessary iterations your approach takes to skip the intermediate blocks.

If we are concerned that most change_memory_common() operations are for a single page, then
we can do something like:

unsigned long size = end - start;
bool start_split = false, end_split = false;

if (start not aligned to block mapping)
	start_split = split(start);

/*
 * Split the end only if the start wasn't split, or
 * if it cannot be guaranteed that start and end lie
 * on the same contig block.
 */
if (!start_split || (size > CONT_PTE_SIZE))
	end_split = split(end);
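
To make that a bit more concrete, here is a rough sketch of the shape I have in mind - not
actual patch code; addr_covered_by_block() and split_mapping_at() are hypothetical helpers
that would walk the tables down to that single address and split whatever block/contig
mapping covers it:

static int split_mapping_boundaries(unsigned long start, unsigned long end)
{
	unsigned long size = end - start;
	bool start_split = false;
	int ret = 0;

	/* Only the two boundaries can land in the middle of a block mapping. */
	if (addr_covered_by_block(start)) {
		ret = split_mapping_at(start);
		if (ret)
			return ret;
		start_split = true;
	}

	/*
	 * The end needs its own split only if the start wasn't split, or if
	 * start and end cannot be guaranteed to lie on the same contig block.
	 */
	if (addr_covered_by_block(end) &&
	    (!start_split || size > CONT_PTE_SIZE))
		ret = split_mapping_at(end);

	return ret;
}

That way the cost of the walk is bounded by the depth of the page tables rather than by the
size of the range being changed.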



