lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Mon, 27 May 2019 12:21:23 +0200
From:   David Hildenbrand <david@...hat.com>
To:     Anshuman Khandual <anshuman.khandual@....com>,
        linux-kernel@...r.kernel.org, linux-arm-kernel@...ts.infradead.org,
        akpm@...ux-foundation.org, catalin.marinas@....com,
        will.deacon@....com
Cc:     mark.rutland@....com, mhocko@...e.com, ira.weiny@...el.com,
        cai@....pw, logang@...tatee.com, james.morse@....com,
        cpandya@...eaurora.org, arunks@...eaurora.org,
        dan.j.williams@...el.com, mgorman@...hsingularity.net,
        osalvador@...e.de, ard.biesheuvel@....com
Subject: Re: [PATCH V4 4/4] arm64/mm: Enable memory hot remove

On 27.05.19 10:09, Anshuman Khandual wrote:
> 
> 
> On 05/21/2019 03:50 PM, David Hildenbrand wrote:
>> On 20.05.19 07:18, Anshuman Khandual wrote:
>>> The arch code for hot-remove must tear down portions of the linear map and
>>> vmemmap corresponding to memory being removed. In both cases the page
>>> tables mapping these regions must be freed, and when sparse vmemmap is in
>>> use the memory backing the vmemmap must also be freed.
>>>
>>> This patch adds a new remove_pagetable() helper which can be used to tear
>>> down either region, and calls it from vmemmap_free() and
>>> ___remove_pgd_mapping(). The sparse_vmap argument determines whether the
>>> backing memory will be freed.
>>>
>>> While freeing intermediate level page table pages bail out if any of it's
>>> entries are still valid. This can happen for partially filled kernel page
>>> table either from a previously attempted failed memory hot add or while
>>> removing an address range which does not span the entire page table page
>>> range.
>>>
>>> The vmemmap region may share levels of table with the vmalloc region. Take
>>> the kernel ptl so that we can safely free potentially-shared tables.
>>>
>>> While here update arch_add_memory() to handle __add_pages() failures by
>>> just unmapping recently added kernel linear mapping. Now enable memory hot
>>> remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.
>>>
>>> This implementation is overall inspired from kernel page table tear down
>>> procedure on X86 architecture.
>>>
>>> Signed-off-by: Anshuman Khandual <anshuman.khandual@....com>
>>> ---
>>>  arch/arm64/Kconfig  |   3 +
>>>  arch/arm64/mm/mmu.c | 212 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>  2 files changed, 213 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>>> index 4780eb7..ce24427 100644
>>> --- a/arch/arm64/Kconfig
>>> +++ b/arch/arm64/Kconfig
>>> @@ -267,6 +267,9 @@ config HAVE_GENERIC_GUP
>>>  config ARCH_ENABLE_MEMORY_HOTPLUG
>>>  	def_bool y
>>>  
>>> +config ARCH_ENABLE_MEMORY_HOTREMOVE
>>> +	def_bool y
>>> +
>>>  config SMP
>>>  	def_bool y
>>>  
>>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>>> index a1bfc44..0cf0d41 100644
>>> --- a/arch/arm64/mm/mmu.c
>>> +++ b/arch/arm64/mm/mmu.c
>>> @@ -733,6 +733,187 @@ int kern_addr_valid(unsigned long addr)
>>>  
>>>  	return pfn_valid(pte_pfn(pte));
>>>  }
>>> +
>>> +#ifdef CONFIG_MEMORY_HOTPLUG
>>> +static void free_hotplug_page_range(struct page *page, ssize_t size)
>>> +{
>>> +	WARN_ON(PageReserved(page));
>>> +	free_pages((unsigned long)page_address(page), get_order(size));
>>> +}
>>> +
>>> +static void free_hotplug_pgtable_page(struct page *page)
>>> +{
>>> +	free_hotplug_page_range(page, PAGE_SIZE);
>>> +}
>>> +
>>> +static void free_pte_table(pte_t *ptep, pmd_t *pmdp, unsigned long addr)
>>> +{
>>> +	struct page *page;
>>> +	int i;
>>> +
>>> +	for (i = 0; i < PTRS_PER_PTE; i++) {
>>> +		if (!pte_none(ptep[i]))
>>> +			return;
>>> +	}
>>> +
>>> +	page = pmd_page(READ_ONCE(*pmdp));
>>> +	pmd_clear(pmdp);
>>> +	__flush_tlb_kernel_pgtable(addr);
>>> +	free_hotplug_pgtable_page(page);
>>> +}
>>> +
>>> +static void free_pmd_table(pmd_t *pmdp, pud_t *pudp, unsigned long addr)
>>> +{
>>> +	struct page *page;
>>> +	int i;
>>> +
>>> +	if (CONFIG_PGTABLE_LEVELS <= 2)
>>> +		return;
>>> +
>>> +	for (i = 0; i < PTRS_PER_PMD; i++) {
>>> +		if (!pmd_none(pmdp[i]))
>>> +			return;
>>> +	}
>>> +
>>> +	page = pud_page(READ_ONCE(*pudp));
>>> +	pud_clear(pudp);
>>> +	__flush_tlb_kernel_pgtable(addr);
>>> +	free_hotplug_pgtable_page(page);
>>> +}
>>> +
>>> +static void free_pud_table(pud_t *pudp, pgd_t *pgdp, unsigned long addr)
>>> +{
>>> +	struct page *page;
>>> +	int i;
>>> +
>>> +	if (CONFIG_PGTABLE_LEVELS <= 3)
>>> +		return;
>>> +
>>> +	for (i = 0; i < PTRS_PER_PUD; i++) {
>>> +		if (!pud_none(pudp[i]))
>>> +			return;
>>> +	}
>>> +
>>> +	page = pgd_page(READ_ONCE(*pgdp));
>>> +	pgd_clear(pgdp);
>>> +	__flush_tlb_kernel_pgtable(addr);
>>> +	free_hotplug_pgtable_page(page);
>>> +}
>>> +
>>> +static void
>>> +remove_pte_table(pmd_t *pmdp, unsigned long addr,
>>> +			unsigned long end, bool sparse_vmap)
>>> +{
>>> +	struct page *page;
>>> +	pte_t *ptep, pte;
>>> +	unsigned long start = addr;
>>> +
>>> +	for (; addr < end; addr += PAGE_SIZE) {
>>> +		ptep = pte_offset_kernel(pmdp, addr);
>>> +		pte = READ_ONCE(*ptep);
>>> +
>>> +		if (pte_none(pte))
>>> +			continue;
>>> +
>>> +		WARN_ON(!pte_present(pte));
>>> +		if (sparse_vmap) {
>>> +			page = pte_page(pte);
>>> +			free_hotplug_page_range(page, PAGE_SIZE);
>>> +		}
>>> +		pte_clear(&init_mm, addr, ptep);
>>> +	}
>>> +	flush_tlb_kernel_range(start, end);
>>> +}
>>> +
>>> +static void
>>> +remove_pmd_table(pud_t *pudp, unsigned long addr,
>>> +			unsigned long end, bool sparse_vmap)
>>> +{
>>> +	unsigned long next;
>>> +	struct page *page;
>>> +	pte_t *ptep_base;
>>> +	pmd_t *pmdp, pmd;
>>> +
>>> +	for (; addr < end; addr = next) {
>>> +		next = pmd_addr_end(addr, end);
>>> +		pmdp = pmd_offset(pudp, addr);
>>> +		pmd = READ_ONCE(*pmdp);
>>> +
>>> +		if (pmd_none(pmd))
>>> +			continue;
>>> +
>>> +		WARN_ON(!pmd_present(pmd));
>>> +		if (pmd_sect(pmd)) {
>>> +			if (sparse_vmap) {
>>> +				page = pmd_page(pmd);
>>> +				free_hotplug_page_range(page, PMD_SIZE);
>>> +			}
>>> +			pmd_clear(pmdp);
>>> +			continue;
>>> +		}
>>> +		ptep_base = pte_offset_kernel(pmdp, 0UL);
>>> +		remove_pte_table(pmdp, addr, next, sparse_vmap);
>>> +		free_pte_table(ptep_base, pmdp, addr);
>>> +	}
>>> +}
>>> +
>>> +static void
>>> +remove_pud_table(pgd_t *pgdp, unsigned long addr,
>>> +			unsigned long end, bool sparse_vmap)
>>> +{
>>> +	unsigned long next;
>>> +	struct page *page;
>>> +	pmd_t *pmdp_base;
>>> +	pud_t *pudp, pud;
>>> +
>>> +	for (; addr < end; addr = next) {
>>> +		next = pud_addr_end(addr, end);
>>> +		pudp = pud_offset(pgdp, addr);
>>> +		pud = READ_ONCE(*pudp);
>>> +
>>> +		if (pud_none(pud))
>>> +			continue;
>>> +
>>> +		WARN_ON(!pud_present(pud));
>>> +		if (pud_sect(pud)) {
>>> +			if (sparse_vmap) {
>>> +				page = pud_page(pud);
>>> +				free_hotplug_page_range(page, PUD_SIZE);
>>> +			}
>>> +			pud_clear(pudp);
>>> +			continue;
>>> +		}
>>> +		pmdp_base = pmd_offset(pudp, 0UL);
>>> +		remove_pmd_table(pudp, addr, next, sparse_vmap);
>>> +		free_pmd_table(pmdp_base, pudp, addr);
>>> +	}
>>> +}
>>> +
>>> +static void
>>> +remove_pagetable(unsigned long start, unsigned long end, bool sparse_vmap)
>>> +{
>>> +	unsigned long addr, next;
>>> +	pud_t *pudp_base;
>>> +	pgd_t *pgdp, pgd;
>>> +
>>> +	spin_lock(&init_mm.page_table_lock);
>>> +	for (addr = start; addr < end; addr = next) {
>>> +		next = pgd_addr_end(addr, end);
>>> +		pgdp = pgd_offset_k(addr);
>>> +		pgd = READ_ONCE(*pgdp);
>>> +
>>> +		if (pgd_none(pgd))
>>> +			continue;
>>> +
>>> +		WARN_ON(!pgd_present(pgd));
>>> +		pudp_base = pud_offset(pgdp, 0UL);
>>> +		remove_pud_table(pgdp, addr, next, sparse_vmap);
>>> +		free_pud_table(pudp_base, pgdp, addr);
>>> +	}
>>> +	spin_unlock(&init_mm.page_table_lock);
>>> +}
>>> +#endif
>>> +
>>>  #ifdef CONFIG_SPARSEMEM_VMEMMAP
>>>  #if !ARM64_SWAPPER_USES_SECTION_MAPS
>>>  int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
>>> @@ -780,6 +961,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
>>>  void vmemmap_free(unsigned long start, unsigned long end,
>>>  		struct vmem_altmap *altmap)
>>>  {
>>> +#ifdef CONFIG_MEMORY_HOTPLUG
>>> +	remove_pagetable(start, end, true);
>>> +#endif
>>>  }
>>>  #endif	/* CONFIG_SPARSEMEM_VMEMMAP */
>>>  
>>> @@ -1070,10 +1254,16 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
>>>  }
>>>  
>>>  #ifdef CONFIG_MEMORY_HOTPLUG
>>> +static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
>>> +{
>>> +	WARN_ON(pgdir != init_mm.pgd);
>>> +	remove_pagetable(start, start + size, false);
>>> +}
>>> +
>>>  int arch_add_memory(int nid, u64 start, u64 size,
>>>  			struct mhp_restrictions *restrictions)
>>>  {
>>> -	int flags = 0;
>>> +	int ret, flags = 0;
>>>  
>>>  	if (rodata_full || debug_pagealloc_enabled())
>>>  		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
>>> @@ -1081,7 +1271,25 @@ int arch_add_memory(int nid, u64 start, u64 size,
>>>  	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
>>>  			     size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
>>>  
>>> -	return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
>>> +	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
>>>  			   restrictions);
>>> +	if (ret)
>>> +		__remove_pgd_mapping(swapper_pg_dir,
>>> +					__phys_to_virt(start), size);
>>
>> Nit: Indentation of the parameters looks really weird.
>>
>>> +	return ret;
>>> +}
>>> +
>>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>>> +void arch_remove_memory(int nid, u64 start, u64 size,
>>> +				struct vmem_altmap *altmap)
>>> +{
>>> +	unsigned long start_pfn = start >> PAGE_SHIFT;
>>> +	unsigned long nr_pages = size >> PAGE_SHIFT;
>>> +	struct zone *zone = page_zone(pfn_to_page(start_pfn));
>>> +
>>> +	__remove_pages(zone, start_pfn, nr_pages, altmap);
>>> +	__remove_pgd_mapping(swapper_pg_dir,
>>> +					__phys_to_virt(start), size);
>>
>> Dito, indentation of the parameters.
>>
>> For these two changes (arch_*_memory)
>>
>> Acked-by: David Hildenbrand <david@...hat.com>
> 
> Thanks David. The following change on this patch should fix the indentation
> problem. If there are no other comments I will incorporate this and re-spin
> the series once more.
> 
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 0cf0d41..a87ba18 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -1273,9 +1273,10 @@ int arch_add_memory(int nid, u64 start, u64 size,
>  
>  	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
>  			   restrictions);
> -	if (ret)
> -		__remove_pgd_mapping(swapper_pg_dir,
> -					__phys_to_virt(start), size);
> +	if (!ret)
> +		return ret;
> +
> +	__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);


you can leave the old code, just make sure the start of all parameters
on new lines are aligned, that seemed to be not the case

if (ret)
	__remove_pgd_mapping(swapper_pg_dir,
			     __phys_to_virt(start), size);
............................^

>  	return ret;
>  }
>  
> @@ -1288,8 +1289,7 @@ void arch_remove_memory(int nid, u64 start, u64 size,
>  	struct zone *zone = page_zone(pfn_to_page(start_pfn));
>  
>  	__remove_pages(zone, start_pfn, nr_pages, altmap);
> -	__remove_pgd_mapping(swapper_pg_dir,
> -					__phys_to_virt(start), size);
> +	__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
>  }
>  #endif
>  #endif
> 


-- 

Thanks,

David / dhildenb

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ