Message-ID: <aUPEjPvJdS/QxGyt@e129823.arm.com>
Date: Thu, 18 Dec 2025 09:08:28 +0000
From: Yeoreum Yun <yeoreum.yun@....com>
To: "David Hildenbrand (Red Hat)" <david@...nel.org>
Cc: catalin.marinas@....com, will@...nel.org, ryan.roberts@....com,
akpm@...ux-foundation.org, kevin.brodsky@....com,
quic_zhenhuah@...cinc.com, dev.jain@....com,
yang@...amperecomputing.com, chaitanyas.prakash@....com,
bigeasy@...utronix.de, clrkwllms@...nel.org, rostedt@...dmis.org,
lorenzo.stoakes@...cle.com, ardb@...nel.org, jackmanb@...gle.com,
vbabka@...e.cz, linux-arm-kernel@...ts.infradead.org,
linux-kernel@...r.kernel.org, linux-rt-devel@...ts.linux.dev
Subject: Re: [PATCH v2 1/2] arm64: mmu: don't allocate page while spliting
linear mapping
Hi David,
> > Currently, linear_map_split_to_ptes() allocates page tables with GFP_ATOMIC
> > while splitting the linear mapping to PTEs under stop_machine().
> >
> > This is fine for the non-PREEMPT_RT case.
> > However, it is a problem on PREEMPT_RT, because the generic memory
> > allocation/free APIs (e.g. pgtable_alloc(), __get_free_pages(), etc.),
> > apart from the _nolock() APIs, cannot be called from non-preemptible
> > context: they are *sleepable* since they use *spin_lock()*.
> >
> > IOW, calling pgtable_alloc(), even with GFP_ATOMIC, is not allowed in
> > __linear_map_split_to_pte(), which is executed by the stopper thread
> > with preemption disabled on PREEMPT_RT.
> >
> > To address this, divide linear_map_maybe_split_to_ptes() into three steps:
> > - collect the number of pages required for splitting.
>
> COLLECT step
>
> > - allocate the required number of pages for splitting.
>
> PREALLOC step
>
> > - split the linear map using the pre-allocated pages.
>
> SPLIT step
>
> The steps you have below are confusing :)
>
> > +enum split_modes {
>
> Is it a "mode" or a "step" ?
>
> enum split_step {
> SPLIT_STEP_COLLECT,
> ...
> };
>
>
> But reading the code below I am lost how these modes apply here.
>
> I guess what you mean is something like
>
> enum split_mode {
> SPLIT_MODE_ALLOC, /* ... from the buddy. */
> SPLIT_MODE_PREALLOCATED, /* ... from preallocated buffer. */
> SPLIT_MODE_COLLECT, /* collect required page tables. */
> };
Sorry for the *bad* naming on my part.
Yes, your suggestion is much clearer. I'll change it accordingly.
>
> But why is SPLIT_MODE_ALLOC vs. SPLIT_MODE_PREALLOCATED required?
>
> Can't we just test if there is something in the preallocated buffer, and if
> not, simply allocate?
>
> IOW, all you need is "collect vs. alloc", and the buffer information tells
> you how to allocate.
Agreed. Since the number of page tables is already collected before
allocation in linear_map_split_to_ptes(), that would be better.
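For v3 I'm thinking of keeping only a COLLECT mode plus the normal split,
and letting the allocation helper check the buffer itself. Roughly (an
untested sketch; the enum and helper names are still tentative):

enum split_mode {
	SPLIT_MODE_COLLECT,	/* only count the page tables we will need */
	SPLIT_MODE_SPLIT,	/* actually split, allocating as we go */
};

struct split_args {
	enum split_mode mode;
	gfp_t gfp;		/* used when no preallocated buffer exists */
	unsigned long nr;	/* number of preallocated pages */
	unsigned long i;	/* next unused preallocated page */
	struct page **pages;	/* NULL when allocating from the buddy */
};

static phys_addr_t
pgd_pgtable_alloc_split(enum pgtable_type pgtable_type,
			struct split_args *split_args)
{
	/* Prefer the preallocated buffer when one was set up ... */
	if (split_args->pages && split_args->i < split_args->nr)
		return __pgd_pgtable_get_preallocated(&init_mm, split_args,
						      pgtable_type);

	/* ... otherwise fall back to allocating from the buddy. */
	return __pgd_pgtable_alloc(&init_mm, split_args->gfp, pgtable_type);
}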
>
> > + SPLIT_ALLOC,
> > + SPLIT_PREALLOC,
> > + SPLIT_COLLECT,
> > +};
> > - pa = page_to_phys(ptdesc_page(ptdesc));
> > +struct split_args {
> > + enum split_modes mode;
> > + union {
> > + struct {
> > + unsigned long nr;
> > + unsigned long i;
> > + struct page **pages;
> > + };
> > + gfp_t gfp;
> > + };
> > +};
> > +static __always_inline void __pgd_pgtable_init(struct mm_struct *mm,
> > + struct ptdesc *ptdesc,
> > + enum pgtable_type pgtable_type)
> > +{
> > switch (pgtable_type) {
> > case TABLE_PTE:
> > BUG_ON(!pagetable_pte_ctor(mm, ptdesc));
> > @@ -554,19 +564,56 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp,
> > break;
> > }
> > +}
> > +
> > +static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp,
> > + enum pgtable_type pgtable_type)
> > +{
> > + /* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
> > + struct ptdesc *ptdesc = pagetable_alloc(gfp & ~__GFP_ZERO, 0);
> > + phys_addr_t pa;
> > +
> > + if (!ptdesc)
> > + return INVALID_PHYS_ADDR;
> > +
> > + pa = page_to_phys(ptdesc_page(ptdesc));
> > +
> > + __pgd_pgtable_init(mm, ptdesc, pgtable_type);
> > +
> > return pa;
> > }
> > +static phys_addr_t __pgd_pgtable_prealloc(struct mm_struct *mm,
> > + struct split_args *split_args,
> > + enum pgtable_type pgtable_type)
>
> The function is called "_prealloc" but actually it simply grabs a
> preallocated page. Confusing.
>
>
> Should this function be called something like
> "__pgd_pgtable_get_preallocated()". Not sure.
Sorry for the confusion. I think that name would be better.
I'll fix it.
> > +{
> > + struct page *page;
> > +
> > + BUG_ON(split_args->i >= split_args->nr);
>
> No BUG_ON().
Okay. As only two modes are kept:
- COLLECT
- ALLOC
I'll remove this; see the sketch below.
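I.e. something along these lines (untested sketch; the name follows your
suggestion and may still change):

static phys_addr_t __pgd_pgtable_get_preallocated(struct mm_struct *mm,
						  struct split_args *split_args,
						  enum pgtable_type pgtable_type)
{
	struct page *page;

	/* Running out of preallocated pages is reported, not BUG()ed on. */
	if (split_args->i >= split_args->nr)
		return INVALID_PHYS_ADDR;

	page = split_args->pages[split_args->i++];
	if (!page)
		return INVALID_PHYS_ADDR;

	__pgd_pgtable_init(mm, page_ptdesc(page), pgtable_type);

	return page_to_phys(page);
}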
> > +
> > + page = split_args->pages[split_args->i++];
> > + if (!page)
> > + return INVALID_PHYS_ADDR;
> > +
> > + __pgd_pgtable_init(mm, page_ptdesc(page), pgtable_type);
> > +
> > + return page_to_phys(page);
> > +}
> > +
> > static phys_addr_t
> > -pgd_pgtable_alloc_init_mm_gfp(enum pgtable_type pgtable_type, gfp_t gfp)
> > +pgd_pgtable_alloc_split(enum pgtable_type pgtable_type,
> > + struct split_args *split_args)
> > {
> > - return __pgd_pgtable_alloc(&init_mm, gfp, pgtable_type);
> > + if (split_args->mode == SPLIT_ALLOC)
> > + return __pgd_pgtable_alloc(&init_mm, split_args->gfp, pgtable_type);
> > +
> > + return __pgd_pgtable_prealloc(&init_mm, split_args, pgtable_type);
> > }
> > static phys_addr_t __maybe_unused
> > pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type)
> > {
> > - return pgd_pgtable_alloc_init_mm_gfp(pgtable_type, GFP_PGTABLE_KERNEL);
> > + return __pgd_pgtable_alloc(&init_mm, GFP_PGTABLE_KERNEL, pgtable_type);
> > }
> > static phys_addr_t
> > @@ -584,7 +631,9 @@ static void split_contpte(pte_t *ptep)
> > __set_pte(ptep, pte_mknoncont(__ptep_get(ptep)));
> > }
> > -static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
> > +static int split_pmd(pmd_t *pmdp, pmd_t pmd,
> > + struct split_args *split_args,
> > + bool to_cont)
> > {
> > pmdval_t tableprot = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF;
> > unsigned long pfn = pmd_pfn(pmd);
> > @@ -593,7 +642,7 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
> > pte_t *ptep;
> > int i;
> > - pte_phys = pgd_pgtable_alloc_init_mm_gfp(TABLE_PTE, gfp);
> > + pte_phys = pgd_pgtable_alloc_split(TABLE_PTE, split_args);
> > if (pte_phys == INVALID_PHYS_ADDR)
> > return -ENOMEM;
> > ptep = (pte_t *)phys_to_virt(pte_phys);
> > @@ -628,7 +677,9 @@ static void split_contpmd(pmd_t *pmdp)
> > set_pmd(pmdp, pmd_mknoncont(pmdp_get(pmdp)));
> > }
> > -static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
> > +static int split_pud(pud_t *pudp, pud_t pud,
> > + struct split_args *split_args,
> > + bool to_cont)
> > {
> > pudval_t tableprot = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
> > unsigned int step = PMD_SIZE >> PAGE_SHIFT;
> > @@ -638,7 +689,7 @@ static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
> > pmd_t *pmdp;
> > int i;
> > - pmd_phys = pgd_pgtable_alloc_init_mm_gfp(TABLE_PMD, gfp);
> > + pmd_phys = pgd_pgtable_alloc_split(TABLE_PMD, split_args);
> > if (pmd_phys == INVALID_PHYS_ADDR)
> > return -ENOMEM;
> > pmdp = (pmd_t *)phys_to_virt(pmd_phys);
> > @@ -672,6 +723,10 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
> > pmd_t *pmdp, pmd;
> > pte_t *ptep, pte;
> > int ret = 0;
> > + struct split_args split_args = {
> > + .mode = SPLIT_ALLOC,
> > + .gfp = GFP_PGTABLE_KERNEL,
> > + };
> > /*
> > * PGD: If addr is PGD aligned then addr already describes a leaf
> > @@ -707,7 +762,7 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
> > if (!pud_present(pud))
> > goto out;
> > if (pud_leaf(pud)) {
> > - ret = split_pud(pudp, pud, GFP_PGTABLE_KERNEL, true);
> > + ret = split_pud(pudp, pud, &split_args, true);
> > if (ret)
> > goto out;
> > }
> > @@ -732,7 +787,7 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
> > */
> > if (ALIGN_DOWN(addr, PMD_SIZE) == addr)
> > goto out;
> > - ret = split_pmd(pmdp, pmd, GFP_PGTABLE_KERNEL, true);
> > + ret = split_pmd(pmdp, pmd, &split_args, true);
> > if (ret)
> > goto out;
> > }
> > @@ -831,12 +886,17 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
> > static int split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
> > unsigned long next, struct mm_walk *walk)
> > {
> > - gfp_t gfp = *(gfp_t *)walk->private;
> > + struct split_args *split_args = (struct split_args *)walk->private;
> > pud_t pud = pudp_get(pudp);
> > int ret = 0;
> > - if (pud_leaf(pud))
> > - ret = split_pud(pudp, pud, gfp, false);
> > + if (!pud_leaf(pud))
> > + return 0;
> > +
> > + if (split_args->mode == SPLIT_COLLECT)
> > + split_args->nr++;
> > + else
> > + ret = split_pud(pudp, pud, split_args, false);
> > return ret;
> > }
> > @@ -844,22 +904,29 @@ static int split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
> > static int split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
> > unsigned long next, struct mm_walk *walk)
> > {
> > - gfp_t gfp = *(gfp_t *)walk->private;
> > + struct split_args *split_args = (struct split_args *)walk->private;
> > pmd_t pmd = pmdp_get(pmdp);
> > - int ret = 0;
> > + int ret;
> > - if (pmd_leaf(pmd)) {
> > - if (pmd_cont(pmd))
> > - split_contpmd(pmdp);
> > - ret = split_pmd(pmdp, pmd, gfp, false);
> > + if (!pmd_leaf(pmd))
> > + return 0;
> > - /*
> > - * We have split the pmd directly to ptes so there is no need to
> > - * visit each pte to check if they are contpte.
> > - */
> > - walk->action = ACTION_CONTINUE;
> > + if (split_args->mode == SPLIT_COLLECT) {
> > + split_args->nr++;
> > + return 0;
> > }
> > + if (pmd_cont(pmd))
> > + split_contpmd(pmdp);
> > + ret = split_pmd(pmdp, pmd, split_args, false);
> > +
> > + /*
> > + * We have split the pmd directly to ptes so there is no need to
> > + * visit each pte to check if they are contpte.
> > + */
> > + walk->action = ACTION_CONTINUE;
> > +
> > +
> > return ret;
> > }
> > @@ -880,13 +947,15 @@ static const struct mm_walk_ops split_to_ptes_ops = {
> > .pte_entry = split_to_ptes_pte_entry,
> > };
> > -static int range_split_to_ptes(unsigned long start, unsigned long end, gfp_t gfp)
> > +static int range_split_to_ptes(unsigned long start, unsigned long end,
> > + struct split_args *split_args)
> > {
> > int ret;
> > arch_enter_lazy_mmu_mode();
> > ret = walk_kernel_page_table_range_lockless(start, end,
> > - &split_to_ptes_ops, NULL, &gfp);
> > + &split_to_ptes_ops, NULL,
> > + split_args);
> > arch_leave_lazy_mmu_mode();
> > return ret;
> > @@ -903,7 +972,7 @@ static void __init init_idmap_kpti_bbml2_flag(void)
> > smp_mb();
> > }
> > -static int __init linear_map_split_to_ptes(void *__unused)
> > +static int __init linear_map_split_to_ptes(void *data)
> > {
> > /*
> > * Repainting the linear map must be done by CPU0 (the boot CPU) because
> > @@ -911,6 +980,7 @@ static int __init linear_map_split_to_ptes(void *__unused)
> > * be held in a waiting area with the idmap active.
> > */
> > if (!smp_processor_id()) {
> > + struct split_args *split_args = data;
> > unsigned long lstart = _PAGE_OFFSET(vabits_actual);
> > unsigned long lend = PAGE_END;
> > unsigned long kstart = (unsigned long)lm_alias(_stext);
> > @@ -928,12 +998,13 @@ static int __init linear_map_split_to_ptes(void *__unused)
> > * PTE. The kernel alias remains static throughout runtime so
> > * can continue to be safely mapped with large mappings.
> > */
> > - ret = range_split_to_ptes(lstart, kstart, GFP_ATOMIC);
> > + ret = range_split_to_ptes(lstart, lend, split_args);
> > if (!ret)
> > - ret = range_split_to_ptes(kend, lend, GFP_ATOMIC);
> > + ret = range_split_to_ptes(kstart, kend, split_args);
> > if (ret)
> > panic("Failed to split linear map\n");
> > - flush_tlb_kernel_range(lstart, lend);
> > + if (split_args->mode != SPLIT_COLLECT)
> > + flush_tlb_kernel_range(lstart, lend);
> > /*
> > * Relies on dsb in flush_tlb_kernel_range() to avoid reordering
> > @@ -963,10 +1034,61 @@ static int __init linear_map_split_to_ptes(void *__unused)
> > void __init linear_map_maybe_split_to_ptes(void)
> > {
> > - if (linear_map_requires_bbml2 && !system_supports_bbml2_noabort()) {
> > - init_idmap_kpti_bbml2_flag();
> > - stop_machine(linear_map_split_to_ptes, NULL, cpu_online_mask);
> > + struct page **pages = NULL;
> > + struct split_args split_args;
> > + int order;
> > + int nr_populated;
> > + int err;
> > +
> > + if (!linear_map_requires_bbml2 || system_supports_bbml2_noabort())
> > + return;
> > +
> > + /*
> > + * Phase 1: Collect required extra pages to split.
> > + */
> > + split_args.mode = SPLIT_COLLECT;
> > + split_args.nr = 0;
> > +
> > + init_idmap_kpti_bbml2_flag();
> > + stop_machine(linear_map_split_to_ptes, &split_args, cpu_online_mask);
> > +
> > + /*
> > + * Phase 2: Allocate necessary pages to split.
> > + */
> > + if (split_args.nr == 0) {
> > + err = 0;
> > + split_args.mode = SPLIT_ALLOC;
> > + } else {
> > + err = -ENOMEM;
> > + order = order_base_2(PAGE_ALIGN(split_args.nr *
> > + sizeof(struct page *)) >> PAGE_SHIFT);
> > +
> > + pages = (struct page **)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
> > + if (!pages)
> > + goto error;
> > +
> > + nr_populated = alloc_pages_bulk(GFP_ATOMIC | __GFP_ZERO, split_args.nr, pages);
> > + if (nr_populated < split_args.nr)
> > + goto error;
>
> That is prone to cause trouble in the future once we allocate memdescs
> separately.
>
> Can we simply call pagetable_alloc() in a loop here instead for the time
> being? A future-proof bulk allocation can be done later separately.
I missed that future plan.
Yes, I'll change this to pagetable_alloc().
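Just to double-check I understood the suggestion, I'd replace the
alloc_pages_bulk() part with a plain loop along these lines (untested
sketch, error handling elided; GFP flags kept as in the current version):

	unsigned long i;

	for (i = 0; i < split_args.nr; i++) {
		/* Allocate one order-0 page table at a time instead of in bulk. */
		struct ptdesc *ptdesc = pagetable_alloc(GFP_ATOMIC | __GFP_ZERO, 0);

		if (!ptdesc)
			goto error;
		pages[i] = ptdesc_page(ptdesc);
	}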
Thanks!
--
Sincerely,
Yeoreum Yun