Message-ID: <aBFbCP9TqNN0bGpB@harry>
Date: Wed, 30 Apr 2025 08:04:40 +0900
From: Harry Yoo <harry.yoo@...cle.com>
To: Alexander Gordeev <agordeev@...ux.ibm.com>
Cc: Andrew Morton <akpm@...ux-foundation.org>,
        Andrey Ryabinin <ryabinin.a.a@...il.com>,
        Daniel Axtens <dja@...ens.net>, linux-kernel@...r.kernel.org,
        linux-mm@...ck.org, kasan-dev@...glegroups.com,
        linux-s390@...r.kernel.org, stable@...r.kernel.org
Subject: Re: [PATCH v3 1/1] kasan: Avoid sleepable page allocation from
 atomic context

On Tue, Apr 29, 2025 at 06:08:41PM +0200, Alexander Gordeev wrote:
> apply_to_pte_range() enters the lazy MMU mode and then invokes the
> kasan_populate_vmalloc_pte() callback on each page table walk
> iteration. However, the callback can sleep when trying to allocate
> a single page, e.g. if an architecture disables preemption on lazy
> MMU mode enter.

Should we add a comment that a pte_fn_t callback passed to
apply_to_pte_range() must not sleep?
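
Something along these lines at the pte_fn_t definition in
include/linux/mm.h, perhaps (untested, wording is just a suggestion):

	/*
	 * apply_to_page_range() and friends may invoke this callback
	 * with lazy MMU mode enabled, in which case an architecture
	 * may run it with preemption disabled. It must not sleep.
	 */
	typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);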

> On s390, if arch_enter_lazy_mmu_mode() is made to call
> preempt_disable() and arch_leave_lazy_mmu_mode() to call
> preempt_enable(), the following crash occurs:
> 
>     [  553.332108] preempt_count: 1, expected: 0
>     [  553.332117] no locks held by multipathd/2116.
>     [  553.332128] CPU: 24 PID: 2116 Comm: multipathd Kdump: loaded Tainted:
>     [  553.332139] Hardware name: IBM 3931 A01 701 (LPAR)
>     [  553.332146] Call Trace:
>     [  553.332152]  [<00000000158de23a>] dump_stack_lvl+0xfa/0x150
>     [  553.332167]  [<0000000013e10d12>] __might_resched+0x57a/0x5e8
>     [  553.332178]  [<00000000144eb6c2>] __alloc_pages+0x2ba/0x7c0
>     [  553.332189]  [<00000000144d5cdc>] __get_free_pages+0x2c/0x88
>     [  553.332198]  [<00000000145663f6>] kasan_populate_vmalloc_pte+0x4e/0x110
>     [  553.332207]  [<000000001447625c>] apply_to_pte_range+0x164/0x3c8
>     [  553.332218]  [<000000001448125a>] apply_to_pmd_range+0xda/0x318
>     [  553.332226]  [<000000001448181c>] __apply_to_page_range+0x384/0x768
>     [  553.332233]  [<0000000014481c28>] apply_to_page_range+0x28/0x38
>     [  553.332241]  [<00000000145665da>] kasan_populate_vmalloc+0x82/0x98
>     [  553.332249]  [<00000000144c88d0>] alloc_vmap_area+0x590/0x1c90
>     [  553.332257]  [<00000000144ca108>] __get_vm_area_node.constprop.0+0x138/0x260
>     [  553.332265]  [<00000000144d17fc>] __vmalloc_node_range+0x134/0x360
>     [  553.332274]  [<0000000013d5dbf2>] alloc_thread_stack_node+0x112/0x378
>     [  553.332284]  [<0000000013d62726>] dup_task_struct+0x66/0x430
>     [  553.332293]  [<0000000013d63962>] copy_process+0x432/0x4b80
>     [  553.332302]  [<0000000013d68300>] kernel_clone+0xf0/0x7d0
>     [  553.332311]  [<0000000013d68bd6>] __do_sys_clone+0xae/0xc8
>     [  553.332400]  [<0000000013d68dee>] __s390x_sys_clone+0xd6/0x118
>     [  553.332410]  [<0000000013c9d34c>] do_syscall+0x22c/0x328
>     [  553.332419]  [<00000000158e7366>] __do_syscall+0xce/0xf0
>     [  553.332428]  [<0000000015913260>] system_call+0x70/0x98
> 
> Instead of allocating a single page per PTE, bulk-allocate the
> shadow memory prior to applying the kasan_populate_vmalloc_pte()
> callback on a page range.
>
> Suggested-by: Andrey Ryabinin <ryabinin.a.a@...il.com>
> Cc: stable@...r.kernel.org
> Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory")
> 
> Signed-off-by: Alexander Gordeev <agordeev@...ux.ibm.com>
> ---
>  mm/kasan/shadow.c | 65 +++++++++++++++++++++++++++++++++++------------
>  1 file changed, 49 insertions(+), 16 deletions(-)
> 
> diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
> index 88d1c9dcb507..ea9a06715a81 100644
> --- a/mm/kasan/shadow.c
> +++ b/mm/kasan/shadow.c
> @@ -292,30 +292,65 @@ void __init __weak kasan_populate_early_vm_area_shadow(void *start,
>  {
>  }
>  
> +struct vmalloc_populate_data {
> +	unsigned long start;
> +	struct page **pages;
> +};
> +
>  static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
> -				      void *unused)
> +				      void *_data)
>  {
> -	unsigned long page;
> +	struct vmalloc_populate_data *data = _data;
> +	struct page *page;
> +	unsigned long pfn;
>  	pte_t pte;
>  
>  	if (likely(!pte_none(ptep_get(ptep))))
>  		return 0;
>  
> -	page = __get_free_page(GFP_KERNEL);
> -	if (!page)
> -		return -ENOMEM;
> -
> -	__memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
> -	pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
> +	page = data->pages[PFN_DOWN(addr - data->start)];
> +	pfn = page_to_pfn(page);
> +	__memset(pfn_to_virt(pfn), KASAN_VMALLOC_INVALID, PAGE_SIZE);
> +	pte = pfn_pte(pfn, PAGE_KERNEL);
>  
>  	spin_lock(&init_mm.page_table_lock);
> -	if (likely(pte_none(ptep_get(ptep)))) {
> +	if (likely(pte_none(ptep_get(ptep))))
>  		set_pte_at(&init_mm, addr, ptep, pte);
> -		page = 0;

With this patch, if the pte is already set, the page is now leaked?

Should we set data->pages[PFN_DOWN(addr - data->start)] = NULL 
and free non-null elements later in __kasan_populate_vmalloc()?
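
Something like this (untested):

	spin_lock(&init_mm.page_table_lock);
	if (likely(pte_none(ptep_get(ptep)))) {
		set_pte_at(&init_mm, addr, ptep, pte);
		/* Consumed; the caller must not free this page. */
		data->pages[PFN_DOWN(addr - data->start)] = NULL;
	}
	spin_unlock(&init_mm.page_table_lock);

and then, back in __kasan_populate_vmalloc(), walk the array after
apply_to_page_range() returns and free_page() every element that is
still non-NULL.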

> -	}
>  	spin_unlock(&init_mm.page_table_lock);
> -	if (page)
> -		free_page(page);
> +
> +	return 0;
> +}
> +
> +static int __kasan_populate_vmalloc(unsigned long start, unsigned long end)
> +{
> +	unsigned long nr_pages, nr_total = PFN_UP(end - start);
> +	struct vmalloc_populate_data data;
> +	int ret;
> +
> +	data.pages = (struct page **)__get_free_page(GFP_KERNEL);
> +	if (!data.pages)
> +		return -ENOMEM;
> +
> +	while (nr_total) {
> +		nr_pages = min(nr_total, PAGE_SIZE / sizeof(data.pages[0]));
> +		__memset(data.pages, 0, nr_pages * sizeof(data.pages[0]));
> +		if (nr_pages != alloc_pages_bulk(GFP_KERNEL, nr_pages, data.pages)) {

When the return value of alloc_pages_bulk() is less than nr_pages,
you still need to free the pages already placed in the array, unless
it returned zero.
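
Something like this, maybe (untested; nr_allocated is a new local
variable):

	nr_allocated = alloc_pages_bulk(GFP_KERNEL, nr_pages, data.pages);
	if (nr_allocated != nr_pages) {
		/* Free whatever the bulk allocator did hand out. */
		while (nr_allocated--)
			__free_page(data.pages[nr_allocated]);
		free_page((unsigned long)data.pages);
		return -ENOMEM;
	}

Since the array was just zeroed, alloc_pages_bulk() fills it from
index 0, so only the first nr_allocated entries can be non-NULL.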

> +			free_page((unsigned long)data.pages);
> +			return -ENOMEM;
> +		}
> +
> +		data.start = start;
> +		ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE,
> +					  kasan_populate_vmalloc_pte, &data);
> +		if (ret)
> +			return ret;
> +
> +		start += nr_pages * PAGE_SIZE;
> +		nr_total -= nr_pages;
> +	}
> +
> +	free_page((unsigned long)data.pages);
> +
>  	return 0;
>  }
>  
> @@ -348,9 +383,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
>  	shadow_start = PAGE_ALIGN_DOWN(shadow_start);
>  	shadow_end = PAGE_ALIGN(shadow_end);
>  
> -	ret = apply_to_page_range(&init_mm, shadow_start,
> -				  shadow_end - shadow_start,
> -				  kasan_populate_vmalloc_pte, NULL);
> +	ret = __kasan_populate_vmalloc(shadow_start, shadow_end);
>  	if (ret)
>  		return ret;
>  
> -- 
> 2.45.2
> 
> 

-- 
Cheers,
Harry / Hyeonggon
