[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <87jzjndz95.fsf@all.your.base.are.belong.to.us>
Date: Tue, 21 May 2024 16:18:46 +0200
From: Björn Töpel <bjorn@...nel.org>
To: Alexandre Ghiti <alexghiti@...osinc.com>
Cc: Albert Ou <aou@...s.berkeley.edu>, David Hildenbrand <david@...hat.com>,
Palmer Dabbelt <palmer@...belt.com>, Paul Walmsley
<paul.walmsley@...ive.com>, linux-riscv@...ts.infradead.org, Oscar
Salvador <osalvador@...e.de>, Björn Töpel
<bjorn@...osinc.com>, Andrew
Bresticker <abrestic@...osinc.com>, Chethan Seshadri
<Chethan.Seshadri@...alinasystems.io>, Lorenzo Stoakes
<lstoakes@...il.com>, Santosh Mamila <santosh.mamila@...alinasystems.io>,
Sivakumar Munnangi <siva.munnangi@...alinasystems.io>, Sunil V L
<sunilvl@...tanamicro.com>, linux-kernel@...r.kernel.org,
linux-mm@...ck.org, virtualization@...ts.linux-foundation.org
Subject: Re: [PATCH v3 5/9] riscv: mm: Add memory hotplugging support
Alexandre Ghiti <alexghiti@...osinc.com> writes:
> On Tue, May 21, 2024 at 1:49 PM Björn Töpel <bjorn@...nel.org> wrote:
>>
>> From: Björn Töpel <bjorn@...osinc.com>
>>
>> For an architecture to support memory hotplugging, a couple of
>> callbacks needs to be implemented:
>>
>> arch_add_memory()
>> This callback is responsible for adding the physical memory into the
>> direct map, and call into the memory hotplugging generic code via
>> __add_pages() that adds the corresponding struct page entries, and
>> updates the vmemmap mapping.
>>
>> arch_remove_memory()
>> This is the inverse of the callback above.
>>
>> vmemmap_free()
>> This function tears down the vmemmap mappings (if
>> CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the
>> backing vmemmap pages. Note that for persistent memory, an
>> alternative allocator for the backing pages can be used; The
>> vmem_altmap. This means that when the backing pages are cleared,
>> extra care is needed so that the correct deallocation method is
>> used.
>>
>> arch_get_mappable_range()
>> This functions returns the PA range that the direct map can map.
>> Used by the MHP internals for sanity checks.
>>
>> The page table unmap/teardown functions are heavily based on code from
>> the x86 tree. The same remove_pgd_mapping() function is used in both
>> vmemmap_free() and arch_remove_memory(), but in the latter function
>> the backing pages are not removed.
>>
>> Signed-off-by: Björn Töpel <bjorn@...osinc.com>
>> ---
>> arch/riscv/mm/init.c | 261 +++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 261 insertions(+)
>>
>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
>> index 6f72b0b2b854..6693b742bf2f 100644
>> --- a/arch/riscv/mm/init.c
>> +++ b/arch/riscv/mm/init.c
>> @@ -1493,3 +1493,264 @@ void __init pgtable_cache_init(void)
>> }
>> }
>> #endif
>> +
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +static void __meminit free_pagetable(struct page *page, int order)
>> +{
>> + unsigned int nr_pages = 1 << order;
>> +
>> + /*
>> + * vmemmap/direct page tables can be reserved, if added at
>> + * boot.
>> + */
>> + if (PageReserved(page)) {
>> + __ClearPageReserved(page);
>
> What's the difference between __ClearPageReserved() and
> ClearPageReserved()? Because it seems like free_reserved_page() calls
> the latter already, so why would you need to call
> __ClearPageReserved() on the first page?
Indeed! x86 copy pasta (which uses bootmem info page that RV doesn't).
>> + while (nr_pages--)
>> + free_reserved_page(page++);
>> + return;
>> + }
>> +
>> + free_pages((unsigned long)page_address(page), order);
>> +}
>> +
>> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
>> +{
>> + pte_t *pte;
>> + int i;
>> +
>> + for (i = 0; i < PTRS_PER_PTE; i++) {
>> + pte = pte_start + i;
>> + if (!pte_none(*pte))
>> + return;
>> + }
>> +
>> + free_pagetable(pmd_page(*pmd), 0);
>> + pmd_clear(pmd);
>> +}
>> +
>> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
>> +{
>> + pmd_t *pmd;
>> + int i;
>> +
>> + for (i = 0; i < PTRS_PER_PMD; i++) {
>> + pmd = pmd_start + i;
>> + if (!pmd_none(*pmd))
>> + return;
>> + }
>> +
>> + free_pagetable(pud_page(*pud), 0);
>> + pud_clear(pud);
>> +}
>> +
>> +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
>> +{
>> + pud_t *pud;
>> + int i;
>> +
>> + for (i = 0; i < PTRS_PER_PUD; i++) {
>> + pud = pud_start + i;
>> + if (!pud_none(*pud))
>> + return;
>> + }
>> +
>> + free_pagetable(p4d_page(*p4d), 0);
>> + p4d_clear(p4d);
>> +}
>> +
>> +static void __meminit free_vmemmap_storage(struct page *page, size_t size,
>> + struct vmem_altmap *altmap)
>> +{
>> + if (altmap)
>> + vmem_altmap_free(altmap, size >> PAGE_SHIFT);
>> + else
>> + free_pagetable(page, get_order(size));
>> +}
>> +
>> +static void __meminit remove_pte_mapping(pte_t *pte_base, unsigned long addr, unsigned long end,
>> + bool is_vmemmap, struct vmem_altmap *altmap)
>> +{
>> + unsigned long next;
>> + pte_t *ptep, pte;
>> +
>> + for (; addr < end; addr = next) {
>> + next = (addr + PAGE_SIZE) & PAGE_MASK;
>
> Nit: use ALIGN() instead.
>
>> + if (next > end)
>> + next = end;
>> +
>> + ptep = pte_base + pte_index(addr);
>> + pte = READ_ONCE(*ptep);
>
> Nit: Use ptep_get()
>
>> +
>> + if (!pte_present(*ptep))
>> + continue;
>> +
>> + pte_clear(&init_mm, addr, ptep);
>> + if (is_vmemmap)
>> + free_vmemmap_storage(pte_page(pte), PAGE_SIZE, altmap);
>> + }
>> +}
>> +
>> +static void __meminit remove_pmd_mapping(pmd_t *pmd_base, unsigned long addr, unsigned long end,
>> + bool is_vmemmap, struct vmem_altmap *altmap)
>> +{
>> + unsigned long next;
>> + pte_t *pte_base;
>> + pmd_t *pmdp, pmd;
>> +
>> + for (; addr < end; addr = next) {
>> + next = pmd_addr_end(addr, end);
>> + pmdp = pmd_base + pmd_index(addr);
>> + pmd = READ_ONCE(*pmdp);
>
> Nit: Use pmdp_get()
>
>> +
>> + if (!pmd_present(pmd))
>> + continue;
>> +
>> + if (pmd_leaf(pmd)) {
>> + pmd_clear(pmdp);
>> + if (is_vmemmap)
>> + free_vmemmap_storage(pmd_page(pmd), PMD_SIZE, altmap);
>> + continue;
>> + }
>> +
>> + pte_base = (pte_t *)pmd_page_vaddr(*pmdp);
>> + remove_pte_mapping(pte_base, addr, next, is_vmemmap, altmap);
>> + free_pte_table(pte_base, pmdp);
>> + }
>> +}
>> +
>> +static void __meminit remove_pud_mapping(pud_t *pud_base, unsigned long addr, unsigned long end,
>> + bool is_vmemmap, struct vmem_altmap *altmap)
>> +{
>> + unsigned long next;
>> + pud_t *pudp, pud;
>> + pmd_t *pmd_base;
>> +
>> + for (; addr < end; addr = next) {
>> + next = pud_addr_end(addr, end);
>> + pudp = pud_base + pud_index(addr);
>> + pud = READ_ONCE(*pudp);
>
> Nit: Use pudp_get()
>
>> +
>> + if (!pud_present(pud))
>> + continue;
>> +
>> + if (pud_leaf(pud)) {
>> + if (pgtable_l4_enabled) {
>> + pud_clear(pudp);
>> + if (is_vmemmap)
>> + free_vmemmap_storage(pud_page(pud), PUD_SIZE, altmap);
>> + }
>> + continue;
>> + }
>> +
>> + pmd_base = pmd_offset(pudp, 0);
>> + remove_pmd_mapping(pmd_base, addr, next, is_vmemmap, altmap);
>> +
>> + if (pgtable_l4_enabled)
>> + free_pmd_table(pmd_base, pudp);
>> + }
>> +}
>> +
>> +static void __meminit remove_p4d_mapping(p4d_t *p4d_base, unsigned long addr, unsigned long end,
>> + bool is_vmemmap, struct vmem_altmap *altmap)
>> +{
>> + unsigned long next;
>> + p4d_t *p4dp, p4d;
>> + pud_t *pud_base;
>> +
>> + for (; addr < end; addr = next) {
>> + next = p4d_addr_end(addr, end);
>> + p4dp = p4d_base + p4d_index(addr);
>> + p4d = READ_ONCE(*p4dp);
>
> Nit: Use p4dp_get()
..and I'll make sure to address these nits as well.
Thanks!
Björn
Powered by blists - more mailing lists