lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Sun, 24 Jan 2021 16:05:49 -0800 (PST)
From:   David Rientjes <rientjes@...gle.com>
To:     Muchun Song <songmuchun@...edance.com>
cc:     corbet@....net, mike.kravetz@...cle.com, tglx@...utronix.de,
        mingo@...hat.com, bp@...en8.de, x86@...nel.org, hpa@...or.com,
        dave.hansen@...ux.intel.com, luto@...nel.org,
        Peter Zijlstra <peterz@...radead.org>, viro@...iv.linux.org.uk,
        Andrew Morton <akpm@...ux-foundation.org>, paulmck@...nel.org,
        mchehab+huawei@...nel.org, pawan.kumar.gupta@...ux.intel.com,
        rdunlap@...radead.org, oneukum@...e.com, anshuman.khandual@....com,
        jroedel@...e.de, almasrymina@...gle.com,
        Matthew Wilcox <willy@...radead.org>, osalvador@...e.de,
        mhocko@...e.com, song.bao.hua@...ilicon.com, david@...hat.com,
        naoya.horiguchi@....com, duanxiongchun@...edance.com,
        linux-doc@...r.kernel.org, linux-kernel@...r.kernel.org,
        linux-mm@...ck.org, linux-fsdevel@...r.kernel.org
Subject: Re: [PATCH v13 05/12] mm: hugetlb: allocate the vmemmap pages
 associated with each HugeTLB page

On Sun, 17 Jan 2021, Muchun Song wrote:

> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> index ce4be1fa93c2..3b146d5949f3 100644
> --- a/mm/sparse-vmemmap.c
> +++ b/mm/sparse-vmemmap.c
> @@ -29,6 +29,7 @@
>  #include <linux/sched.h>
>  #include <linux/pgtable.h>
>  #include <linux/bootmem_info.h>
> +#include <linux/delay.h>
>  
>  #include <asm/dma.h>
>  #include <asm/pgalloc.h>
> @@ -40,7 +41,8 @@
>   * @remap_pte:		called for each non-empty PTE (lowest-level) entry.
>   * @reuse_page:		the page which is reused for the tail vmemmap pages.
>   * @reuse_addr:		the virtual address of the @reuse_page page.
> - * @vmemmap_pages:	the list head of the vmemmap pages that can be freed.
> + * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
> + *			or is mapped from.
>   */
>  struct vmemmap_remap_walk {
>  	void (*remap_pte)(pte_t *pte, unsigned long addr,
> @@ -50,6 +52,10 @@ struct vmemmap_remap_walk {
>  	struct list_head *vmemmap_pages;
>  };
>  
> +/* The gfp mask of allocating vmemmap page */
> +#define GFP_VMEMMAP_PAGE		\
> +	(GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | __GFP_THISNODE)
> +

This is unnecessary, just use the gfp mask directly in allocator.

>  static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
>  			      unsigned long end,
>  			      struct vmemmap_remap_walk *walk)
> @@ -228,6 +234,75 @@ void vmemmap_remap_free(unsigned long start, unsigned long end,
>  	free_vmemmap_page_list(&vmemmap_pages);
>  }
>  
> +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
> +				struct vmemmap_remap_walk *walk)
> +{
> +	pgprot_t pgprot = PAGE_KERNEL;
> +	struct page *page;
> +	void *to;
> +
> +	BUG_ON(pte_page(*pte) != walk->reuse_page);
> +
> +	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
> +	list_del(&page->lru);
> +	to = page_to_virt(page);
> +	copy_page(to, (void *)walk->reuse_addr);
> +
> +	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
> +}
> +
> +static void alloc_vmemmap_page_list(struct list_head *list,
> +				    unsigned long start, unsigned long end)
> +{
> +	unsigned long addr;
> +
> +	for (addr = start; addr < end; addr += PAGE_SIZE) {
> +		struct page *page;
> +		int nid = page_to_nid((const void *)addr);
> +
> +retry:
> +		page = alloc_pages_node(nid, GFP_VMEMMAP_PAGE, 0);
> +		if (unlikely(!page)) {
> +			msleep(100);
> +			/*
> +			 * We should retry infinitely, because we cannot
> +			 * handle allocation failures. Once we allocate
> +			 * vmemmap pages successfully, then we can free
> +			 * a HugeTLB page.
> +			 */
> +			goto retry;

Ugh, I don't think this will work, there's no guarantee that we'll ever 
succeed and now we can't free a 2MB hugepage because we cannot allocate a 
4KB page.  We absolutely have to ensure we make forward progress here.

We're going to be freeing the hugetlb page after this succeeeds, can we 
not use part of the hugetlb page that we're freeing for this memory 
instead?

> +		}
> +		list_add_tail(&page->lru, list);
> +	}
> +}
> +
> +/**
> + * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
> + *			 to the page which is from the @vmemmap_pages
> + *			 respectively.
> + * @start:	start address of the vmemmap virtual address range.
> + * @end:	end address of the vmemmap virtual address range.
> + * @reuse:	reuse address.
> + */
> +void vmemmap_remap_alloc(unsigned long start, unsigned long end,
> +			 unsigned long reuse)
> +{
> +	LIST_HEAD(vmemmap_pages);
> +	struct vmemmap_remap_walk walk = {
> +		.remap_pte	= vmemmap_restore_pte,
> +		.reuse_addr	= reuse,
> +		.vmemmap_pages	= &vmemmap_pages,
> +	};
> +
> +	might_sleep();
> +
> +	/* See the comment in the vmemmap_remap_free(). */
> +	BUG_ON(start - reuse != PAGE_SIZE);
> +
> +	alloc_vmemmap_page_list(&vmemmap_pages, start, end);
> +	vmemmap_remap_range(reuse, end, &walk);
> +}
> +
>  /*
>   * Allocate a block of memory to be used to back the virtual memory map
>   * or to back the page tables that are used to create the mapping.
> -- 
> 2.11.0
> 
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ