Message-ID: <241b6085-47c2-477b-954b-f03f7ef94138@os.amperecomputing.com>
Date: Thu, 20 Nov 2025 09:10:24 -0800
From: Yang Shi <yang@...amperecomputing.com>
To: Ryan Roberts <ryan.roberts@....com>, catalin.marinas@....com,
 will@...nel.org, cl@...two.org
Cc: linux-arm-kernel@...ts.infradead.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH] arm64: mm: fix direct map use over accounting



On 11/20/25 7:11 AM, Ryan Roberts wrote:
> On 19/11/2025 23:57, Yang Shi wrote:
>> The create_pgd_mapping() family of functions may be used in paths other
>> than creating the linear map, and splitting kernel page tables may be
>> done for non-linear-map addresses too. So we need to check whether the
>> address is a linear map address before accounting it to direct map
>> usage. Encapsulate the linear map address check in the new add/sub
>> helpers. The new helpers are no-ops if CONFIG_PROC_FS is not enabled.
>>
>> Signed-off-by: Yang Shi <yang@...amperecomputing.com>
>> ---
>>   arch/arm64/mm/mmu.c | 107 ++++++++++++++++++++++++++++----------------
>>   1 file changed, 69 insertions(+), 38 deletions(-)
>>
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index 8e87a80aa4f3..4f3f0dc23faa 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -52,17 +52,6 @@
>>   
>>   DEFINE_STATIC_KEY_FALSE(arm64_ptdump_lock_key);
>>   
>> -enum direct_map_type {
>> -	PTE,
>> -	CONT_PTE,
>> -	PMD,
>> -	CONT_PMD,
>> -	PUD,
>> -	NR_DIRECT_MAP_TYPE,
>> -};
>> -
>> -static unsigned long direct_map_size[NR_DIRECT_MAP_TYPE];
>> -
>>   u64 kimage_voffset __ro_after_init;
>>   EXPORT_SYMBOL(kimage_voffset);
>>   
>> @@ -183,7 +172,18 @@ static void init_clear_pgtable(void *table)
>>   	dsb(ishst);
>>   }
>>   
>> +typedef enum direct_map_type {
>> +	PTE,
>> +	CONT_PTE,
>> +	PMD,
>> +	CONT_PMD,
>> +	PUD,
>> +	NR_DIRECT_MAP_TYPE,
>> +} direct_map_type_t;
> Why the typedef? I think the Linux coding standard prefers `enum
> direct_map_type` right?

I'm not aware of such a preference. A simple grep for "typedef enum"
shows thousands of results.

Anyway, I don't have a strong preference.

>
>> +
>>   #ifdef CONFIG_PROC_FS
>> +static unsigned long direct_map_size[NR_DIRECT_MAP_TYPE];
> All of these symbols are getting long. How about we call it dm?
>
> enum dm_type;
> static unsigned long dm_meminfo[NR_DM_TYPE];
> dm_meminfo_add();
> dm_meminfo_sub();
>
> That's a bit more pleasant to my eye, at least.

OK, fine to me.

>
>> +
>>   void arch_report_meminfo(struct seq_file *m)
>>   {
>>   	char *size[NR_DIRECT_MAP_TYPE];
>> @@ -220,6 +220,35 @@ void arch_report_meminfo(struct seq_file *m)
>>   		seq_printf(m, "DirectMap%s:	%8lu kB\n",
>>   			size[PUD], direct_map_size[PUD] >> 10);
>>   }
>> +
>> +static inline bool is_linear_map_addr(unsigned long addr)
> So we are calling it direct map everywhere else but linear map here?
>
> Perhaps is_dm_addr() ?

OK

>
>> +{
>> +	return (addr >= PAGE_OFFSET) && (addr < _PAGE_END(VA_BITS_MIN));
> I'm not sure if this is definitely correct? We have:
>
> #define DIRECT_MAP_PHYSMEM_END	__pa(PAGE_END - 1)
>
> Where PAGE_END is not _PAGE_END(VA_BITS_MIN) if KASAN is enabled. Not sure if we
> should be using PAGE_END here or not?

Thanks for catching this. I took a deeper look at it. It seems
_PAGE_END(VA_BITS_MIN) may cover the KASAN shadow area with 52-bit VA
IIUC (configs with < 52-bit VA should not be affected). Currently KASAN
doesn't use the create_pgd_mapping() family to create its page tables,
and we don't split the KASAN shadow area either, so there should be no
over-accounting problem today, but it is fragile. So it looks like
PAGE_END should be used, which ends at KASAN_SHADOW_START when KASAN is
enabled.
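
So in the new version the check would look something like this (just a
sketch, using PAGE_END and the is_dm_addr() name you suggested):

static inline bool is_dm_addr(unsigned long addr)
{
	/* linear map spans [PAGE_OFFSET, PAGE_END) */
	return (addr >= PAGE_OFFSET) && (addr < PAGE_END);
}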

>
>> +}
>> +
>> +static inline void direct_map_meminfo_add(unsigned long addr, unsigned long size,
>> +					  direct_map_type_t type)
>> +{
>> +	if (is_linear_map_addr(addr))
>> +		direct_map_size[type] += size;
>> +}
>> +
>> +static inline void direct_map_meminfo_sub(unsigned long addr, unsigned long size,
>> +					  direct_map_type_t type)
>> +{
>> +	if (is_linear_map_addr(addr))
>> +		direct_map_size[type] -= size;
>> +}
> Given these are non-atomic RMW operations, is there any risk of racing which
> would break accounting? i.e.
>
> THREAD A		THREAD B
>
> read
> 			read
> modify (+ N)
> write
> 			modify (+ M)
> 			write
>
> We would fail to account N here.
>
> I think this is probably safe for the split case; we have the split lock for
> that already. And I think for populating the initial linear map, that's single
> threaded during boot.

Yes, split and initial linear map are safe.

>
> But what about hot[un]plug? Is there anything stopping concurrent split with
> linear map population due to hotplug?

Yes. When hot-adding memory, the memory can't be allocated until it is
fully onlined; when hot-removing memory, the memory has to be offlined
first before the linear map is unmapped. The offlining guarantees there
is no concurrent use or split.

But this reminds me that I need to take care of the hot-remove path,
which calls __remove_pgd_mapping(). Will fix it in the new version.
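
Roughly, wherever that teardown path clears a leaf entry it will need
the matching subtraction, something like the following (illustration
only, exact hook point to be decided in the new version):

	/* e.g. when a PMD block mapping in the linear map is removed */
	dm_meminfo_sub(addr, PMD_SIZE, PMD);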

>
>
>> +#else
>> +static inline void direct_map_meminfo_add(unsigned long addr, unsigned long size,
>> +					  direct_map_type_t type)
>> +{
>> +}
>> +
>> +static inline void direct_map_meminfo_sub(unsigned long addr, unsigned long size,
>> +					  direct_map_type_t type)
>> +{
>> +}
>>   #endif
>>   
>>   static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
>> @@ -286,9 +315,9 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>>   		init_pte(ptep, addr, next, phys, __prot);
>>   
>>   		if (pgprot_val(__prot) & PTE_CONT)
>> -			direct_map_size[CONT_PTE] += next - addr;
>> +			direct_map_meminfo_add(addr, (next - addr), CONT_PTE);
>>   		else
>> -			direct_map_size[PTE] += next - addr;
>> +			direct_map_meminfo_add(addr, (next - addr), PTE);
>>   
>>   		ptep += pte_index(next) - pte_index(addr);
>>   		phys += next - addr;
>> @@ -326,9 +355,9 @@ static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
>>   			 * to avoid double counting.
>>   			 */
>>   			if (pgprot_val(prot) & PTE_CONT)
>> -				direct_map_size[CONT_PMD] += next - addr;
>> +				direct_map_meminfo_add(addr, (next - addr), CONT_PMD);
>>   			else
>> -				direct_map_size[PMD] += next - addr;
>> +				direct_map_meminfo_add(addr, (next - addr), PMD);
>>   			/*
>>   			 * After the PMD entry has been populated once, we
>>   			 * only allow updates to the permission attributes.
>> @@ -435,7 +464,7 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>>   		    (flags & NO_BLOCK_MAPPINGS) == 0) {
>>   			pud_set_huge(pudp, phys, prot);
>>   
>> -			direct_map_size[PUD] += next - addr;
>> +			direct_map_meminfo_add(addr, (next - addr), PUD);
>>   			/*
>>   			 * After the PUD entry has been populated once, we
>>   			 * only allow updates to the permission attributes.
>> @@ -596,20 +625,21 @@ pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type)
>>   	return pa;
>>   }
>>   
>> -static void split_contpte(pte_t *ptep)
>> +static void split_contpte(unsigned long addr, pte_t *ptep)
>>   {
>>   	int i;
>>   
>> -	direct_map_size[CONT_PTE] -= CONT_PTE_SIZE;
>> +	direct_map_meminfo_sub(addr, CONT_PTE_SIZE, CONT_PTE);
>>   
>>   	ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES);
>>   	for (i = 0; i < CONT_PTES; i++, ptep++)
>>   		__set_pte(ptep, pte_mknoncont(__ptep_get(ptep)));
>>   
>> -	direct_map_size[PTE] += CONT_PTE_SIZE;
>> +	direct_map_meminfo_add(addr, CONT_PTE_SIZE, PTE);
> We have a pattern here where memory disappears from one bucket then reappears in
> another. Userspace could sample in between the 2 operations. Is that ok? Nobody
> has told me what the use cases for this feature are yet (beyond "it might be
> useful for debug"). So I can't tell...

Isn't that a very common property of other counters in /proc/meminfo?
For example, with Inactive and Active, the core mm can move folios
between the inactive and active LRUs at any time. /proc/meminfo never
guarantees exact numbers at any given moment; it just shows the current
snapshot. So it is basically used to monitor trends, and users can then
find correlations in those trends.

>
>
> Overall, my opinion is that this patch is not quite cooked yet. Personally I'd
> prefer to see the first half reverted then we can wait another cycle until this
> is ready and apply it all in one go.

No problem for me. Will work on the new version. Thanks for taking the
time to review it.

Yang

>
> Thanks,
> Ryan
>
>
>>   }
>>   
>> -static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
>> +static int split_pmd(unsigned long addr, pmd_t *pmdp, pmd_t pmd, gfp_t gfp,
>> +		     bool to_cont)
>>   {
>>   	pmdval_t tableprot = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF;
>>   	unsigned long pfn = pmd_pfn(pmd);
>> @@ -631,13 +661,13 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
>>   	if (to_cont)
>>   		prot = __pgprot(pgprot_val(prot) | PTE_CONT);
>>   
>> -	direct_map_size[PMD] -= PMD_SIZE;
>> +	direct_map_meminfo_sub(addr, PMD_SIZE, PMD);
>>   	for (i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++)
>>   		__set_pte(ptep, pfn_pte(pfn, prot));
>>   	if (to_cont)
>> -		direct_map_size[CONT_PTE] += PMD_SIZE;
>> +		direct_map_meminfo_add(addr, PMD_SIZE, CONT_PTE);
>>   	else
>> -		direct_map_size[PTE] += PMD_SIZE;
>> +		direct_map_meminfo_add(addr, PMD_SIZE, PTE);
>>   
>>   	/*
>>   	 * Ensure the pte entries are visible to the table walker by the time
>> @@ -649,20 +679,21 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
>>   	return 0;
>>   }
>>   
>> -static void split_contpmd(pmd_t *pmdp)
>> +static void split_contpmd(unsigned long addr, pmd_t *pmdp)
>>   {
>>   	int i;
>>   
>> -	direct_map_size[CONT_PMD] -= CONT_PMD_SIZE;
>> +	direct_map_meminfo_sub(addr, CONT_PMD_SIZE, CONT_PMD);
>>   
>>   	pmdp = PTR_ALIGN_DOWN(pmdp, sizeof(*pmdp) * CONT_PMDS);
>>   	for (i = 0; i < CONT_PMDS; i++, pmdp++)
>>   		set_pmd(pmdp, pmd_mknoncont(pmdp_get(pmdp)));
>>   
>> -	direct_map_size[PMD] += CONT_PMD_SIZE;
>> +	direct_map_meminfo_add(addr, CONT_PMD_SIZE, PMD);
>>   }
>>   
>> -static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
>> +static int split_pud(unsigned long addr, pud_t *pudp, pud_t pud, gfp_t gfp,
>> +		     bool to_cont)
>>   {
>>   	pudval_t tableprot = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
>>   	unsigned int step = PMD_SIZE >> PAGE_SHIFT;
>> @@ -685,13 +716,13 @@ static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
>>   	if (to_cont)
>>   		prot = __pgprot(pgprot_val(prot) | PTE_CONT);
>>   
>> -	direct_map_size[PUD] -= PUD_SIZE;
>> +	direct_map_meminfo_sub(addr, PUD_SIZE, PUD);
>>   	for (i = 0; i < PTRS_PER_PMD; i++, pmdp++, pfn += step)
>>   		set_pmd(pmdp, pfn_pmd(pfn, prot));
>>   	if (to_cont)
>> -		direct_map_size[CONT_PMD] += PUD_SIZE;
>> +		direct_map_meminfo_add(addr, PUD_SIZE, CONT_PMD);
>>   	else
>> -		direct_map_size[PMD] += PUD_SIZE;
>> +		direct_map_meminfo_add(addr, PUD_SIZE, PMD);
>>   
>>   	/*
>>   	 * Ensure the pmd entries are visible to the table walker by the time
>> @@ -746,7 +777,7 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
>>   	if (!pud_present(pud))
>>   		goto out;
>>   	if (pud_leaf(pud)) {
>> -		ret = split_pud(pudp, pud, GFP_PGTABLE_KERNEL, true);
>> +		ret = split_pud(addr, pudp, pud, GFP_PGTABLE_KERNEL, true);
>>   		if (ret)
>>   			goto out;
>>   	}
>> @@ -764,14 +795,14 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
>>   		goto out;
>>   	if (pmd_leaf(pmd)) {
>>   		if (pmd_cont(pmd))
>> -			split_contpmd(pmdp);
>> +			split_contpmd(addr, pmdp);
>>   		/*
>>   		 * PMD: If addr is PMD aligned then addr already describes a
>>   		 * leaf boundary. Otherwise, split to contpte.
>>   		 */
>>   		if (ALIGN_DOWN(addr, PMD_SIZE) == addr)
>>   			goto out;
>> -		ret = split_pmd(pmdp, pmd, GFP_PGTABLE_KERNEL, true);
>> +		ret = split_pmd(addr, pmdp, pmd, GFP_PGTABLE_KERNEL, true);
>>   		if (ret)
>>   			goto out;
>>   	}
>> @@ -788,7 +819,7 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
>>   	if (!pte_present(pte))
>>   		goto out;
>>   	if (pte_cont(pte))
>> -		split_contpte(ptep);
>> +		split_contpte(addr, ptep);
>>   
>>   out:
>>   	return ret;
>> @@ -875,7 +906,7 @@ static int split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
>>   	int ret = 0;
>>   
>>   	if (pud_leaf(pud))
>> -		ret = split_pud(pudp, pud, gfp, false);
>> +		ret = split_pud(addr, pudp, pud, gfp, false);
>>   
>>   	return ret;
>>   }
>> @@ -889,8 +920,8 @@ static int split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
>>   
>>   	if (pmd_leaf(pmd)) {
>>   		if (pmd_cont(pmd))
>> -			split_contpmd(pmdp);
>> -		ret = split_pmd(pmdp, pmd, gfp, false);
>> +			split_contpmd(addr, pmdp);
>> +		ret = split_pmd(addr, pmdp, pmd, gfp, false);
>>   
>>   		/*
>>   		 * We have split the pmd directly to ptes so there is no need to
>> @@ -908,7 +939,7 @@ static int split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
>>   	pte_t pte = __ptep_get(ptep);
>>   
>>   	if (pte_cont(pte))
>> -		split_contpte(ptep);
>> +		split_contpte(addr, ptep);
>>   
>>   	return 0;
>>   }

