[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <ZnkGps-vQbiynNwP@google.com>
Date: Sun, 23 Jun 2024 23:39:50 -0600
From: Yu Zhao <yuzhao@...gle.com>
To: Nanyong Sun <sunnanyong@...wei.com>
Cc: David Rientjes <rientjes@...gle.com>, Will Deacon <will@...nel.org>,
Catalin Marinas <catalin.marinas@....com>,
Matthew Wilcox <willy@...radead.org>, muchun.song@...ux.dev,
Andrew Morton <akpm@...ux-foundation.org>,
anshuman.khandual@....com, wangkefeng.wang@...wei.com,
linux-arm-kernel@...ts.infradead.org, linux-kernel@...r.kernel.org,
linux-mm@...ck.org, Yosry Ahmed <yosryahmed@...gle.com>,
Sourav Panda <souravpanda@...gle.com>
Subject: Re: [PATCH v3 0/3] A Solution to Re-enable hugetlb vmemmap optimize
On Mon, Mar 25, 2024 at 11:24:34PM +0800, Nanyong Sun wrote:
> On 2024/3/14 7:32, David Rientjes wrote:
>
> > On Thu, 8 Feb 2024, Will Deacon wrote:
> >
> > > > How about take a new lock with irq disabled during BBM, like:
> > > >
> > > > +void vmemmap_update_pte(unsigned long addr, pte_t *ptep, pte_t pte)
> > > > +{
> > > > + (NEW_LOCK);
> > > > + pte_clear(&init_mm, addr, ptep);
> > > > + flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
> > > > + set_pte_at(&init_mm, addr, ptep, pte);
> > > > + spin_unlock_irq(NEW_LOCK);
> > > > +}
> > > I really think the only maintainable way to achieve this is to avoid the
> > > possibility of a fault altogether.
> > >
> > > Will
> > >
> > >
> > Nanyong, are you still actively working on making HVO possible on arm64?
> >
> > This would yield a substantial memory savings on hosts that are largely
> > configured with hugetlbfs. In our case, the size of this hugetlbfs pool
> > is actually never changed after boot, but it sounds from the thread that
> > there was an idea to make HVO conditional on FEAT_BBM. Is this being
> > pursued?
> >
> > If so, any testing help needed?
> I'm afraid that FEAT_BBM may not solve the problem here
I think so too -- I came cross this while working on TAO [1].
[1] https://lore.kernel.org/20240229183436.4110845-4-yuzhao@google.com/
> because from Arm
> ARM,
> I see that FEAT_BBM is only used for changing block size. Therefore, in this
> HVO feature,
> it can work in the split PMD stage, that is, BBM can be avoided in
> vmemmap_split_pmd,
> but in the subsequent vmemmap_remap_pte, the Output address of PTE still
> needs to be
> changed. I'm afraid FEAT_BBM is not competent for this stage. Perhaps my
> understanding
> of ARM FEAT_BBM is wrong, and I hope someone can correct me.
> Actually, the solution I first considered was to use the stop_machine
> method, but we have
> products that rely on /proc/sys/vm/nr_overcommit_hugepages to dynamically
> use hugepages,
> so I have to consider performance issues. If your product does not change
> the amount of huge
> pages after booting, using stop_machine() may be a feasible way.
> So far, I still haven't come up with a good solution.
I do have a patch that's similar to stop_machine() -- it uses NMI IPIs
to pause/resume remote CPUs while the local one is doing BBM.
Note that the problem of updating vmemmap for struct page[], as I see
it, is beyond hugeTLB HVO. I think it impacts virtio-mem and memory
hot removal in general [2]. On arm64, we would need to support BBM on
vmemmap so that we can fix the problem with offlining memory (or to be
precise, unmapping offlined struct page[]), by mapping offlined struct
page[] to a read-only page of dummy struct page[], similar to
ZERO_PAGE(). (Or we would have to make extremely invasive changes to
the reader side, i.e., all speculative PFN walkers.)
In case you are interested in testing my approach, you can swap your
patch 2 with the following:
[2] https://lore.kernel.org/20240621213717.1099079-1-yuzhao@google.com/
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index 8ff5f2a2579e..1af1aa34a351 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -12,6 +12,7 @@
#include <asm/processor.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
+#include <asm/cpu.h>
#define __HAVE_ARCH_PGD_FREE
#define __HAVE_ARCH_PUD_FREE
@@ -137,4 +138,58 @@ pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep)
__pmd_populate(pmdp, page_to_phys(ptep), PMD_TYPE_TABLE | PMD_TABLE_PXN);
}
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+
+#define vmemmap_update_lock vmemmap_update_lock
+static inline void vmemmap_update_lock(void)
+{
+ cpus_read_lock();
+}
+
+#define vmemmap_update_unlock vmemmap_update_unlock
+static inline void vmemmap_update_unlock(void)
+{
+ cpus_read_unlock();
+}
+
+#define vmemmap_update_pte vmemmap_update_pte
+static inline void vmemmap_update_pte(unsigned long addr, pte_t *ptep, pte_t pte)
+{
+ preempt_disable();
+ pause_remote_cpus();
+
+ pte_clear(&init_mm, addr, ptep);
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+ set_pte_at(&init_mm, addr, ptep, pte);
+
+ resume_remote_cpus();
+ preempt_enable();
+}
+
+#define vmemmap_update_pmd vmemmap_update_pmd
+static inline void vmemmap_update_pmd(unsigned long addr, pmd_t *pmdp, pte_t *ptep)
+{
+ preempt_disable();
+ pause_remote_cpus();
+
+ pmd_clear(pmdp);
+ flush_tlb_kernel_range(addr, addr + PMD_SIZE);
+ pmd_populate_kernel(&init_mm, pmdp, ptep);
+
+ resume_remote_cpus();
+ preempt_enable();
+}
+
+#define vmemmap_flush_tlb_all vmemmap_flush_tlb_all
+static inline void vmemmap_flush_tlb_all(void)
+{
+}
+
+#define vmemmap_flush_tlb_range vmemmap_flush_tlb_range
+static inline void vmemmap_flush_tlb_range(unsigned long start, unsigned long end)
+{
+}
+
+#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
+
#endif
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index efb13112b408..544b15948b64 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -144,6 +144,9 @@ bool cpus_are_stuck_in_kernel(void);
extern void crash_smp_send_stop(void);
extern bool smp_crash_stop_failed(void);
+void pause_remote_cpus(void);
+void resume_remote_cpus(void);
+
#endif /* ifndef __ASSEMBLY__ */
#endif /* ifndef __ASM_SMP_H */
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 31c8b3094dd7..ae0a178db066 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -71,16 +71,25 @@ enum ipi_msg_type {
IPI_RESCHEDULE,
IPI_CALL_FUNC,
IPI_CPU_STOP,
+ IPI_CPU_PAUSE,
+#ifdef CONFIG_KEXEC_CORE
IPI_CPU_CRASH_STOP,
+#endif
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
IPI_TIMER,
+#endif
+#ifdef CONFIG_IRQ_WORK
IPI_IRQ_WORK,
+#endif
NR_IPI,
/*
* Any enum >= NR_IPI and < MAX_IPI is special and not tracable
* with trace_ipi_*
*/
IPI_CPU_BACKTRACE = NR_IPI,
+#ifdef CONFIG_KGDB
IPI_KGDB_ROUNDUP,
+#endif
MAX_IPI
};
@@ -771,9 +780,16 @@ static const char *ipi_types[NR_IPI] __tracepoint_string = {
[IPI_RESCHEDULE] = "Rescheduling interrupts",
[IPI_CALL_FUNC] = "Function call interrupts",
[IPI_CPU_STOP] = "CPU stop interrupts",
+ [IPI_CPU_PAUSE] = "CPU pause interrupts",
+#ifdef CONFIG_KEXEC_CORE
[IPI_CPU_CRASH_STOP] = "CPU stop (for crash dump) interrupts",
+#endif
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
[IPI_TIMER] = "Timer broadcast interrupts",
+#endif
+#ifdef CONFIG_IRQ_WORK
[IPI_IRQ_WORK] = "IRQ work interrupts",
+#endif
};
static void smp_cross_call(const struct cpumask *target, unsigned int ipinr);
@@ -832,6 +848,85 @@ void __noreturn panic_smp_self_stop(void)
local_cpu_stop();
}
+static DEFINE_SPINLOCK(cpu_pause_lock);
+static cpumask_t paused_cpus;
+static cpumask_t resumed_cpus;
+
+static void pause_local_cpu(void)
+{
+ int cpu = smp_processor_id();
+
+ cpumask_clear_cpu(cpu, &resumed_cpus);
+ /*
+ * Paired with pause_remote_cpus() to confirm that this CPU not only
+ * will be paused but also can be reliably resumed.
+ */
+ smp_wmb();
+ cpumask_set_cpu(cpu, &paused_cpus);
+ /* A typical example for sleep and wake-up functions. */
+ smp_mb();
+ while (!cpumask_test_cpu(cpu, &resumed_cpus)) {
+ wfe();
+ barrier();
+ }
+ barrier();
+ cpumask_clear_cpu(cpu, &paused_cpus);
+}
+
+void pause_remote_cpus(void)
+{
+ cpumask_t cpus_to_pause;
+
+ lockdep_assert_cpus_held();
+ lockdep_assert_preemption_disabled();
+
+ cpumask_copy(&cpus_to_pause, cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), &cpus_to_pause);
+
+ spin_lock(&cpu_pause_lock);
+
+ WARN_ON_ONCE(!cpumask_empty(&paused_cpus));
+
+ smp_cross_call(&cpus_to_pause, IPI_CPU_PAUSE);
+
+ while (!cpumask_equal(&cpus_to_pause, &paused_cpus)) {
+ cpu_relax();
+ barrier();
+ }
+ /*
+ * Paired pause_local_cpu() to confirm that all CPUs not only will be
+ * paused but also can be reliably resumed.
+ */
+ smp_rmb();
+ WARN_ON_ONCE(cpumask_intersects(&cpus_to_pause, &resumed_cpus));
+
+ spin_unlock(&cpu_pause_lock);
+}
+
+void resume_remote_cpus(void)
+{
+ cpumask_t cpus_to_resume;
+
+ lockdep_assert_cpus_held();
+ lockdep_assert_preemption_disabled();
+
+ cpumask_copy(&cpus_to_resume, cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), &cpus_to_resume);
+
+ spin_lock(&cpu_pause_lock);
+
+ cpumask_setall(&resumed_cpus);
+ /* A typical example for sleep and wake-up functions. */
+ smp_mb();
+ while (cpumask_intersects(&cpus_to_resume, &paused_cpus)) {
+ sev();
+ cpu_relax();
+ barrier();
+ }
+
+ spin_unlock(&cpu_pause_lock);
+}
+
#ifdef CONFIG_KEXEC_CORE
static atomic_t waiting_for_crash_ipi = ATOMIC_INIT(0);
#endif
@@ -911,6 +1006,11 @@ static void do_handle_IPI(int ipinr)
local_cpu_stop();
break;
+ case IPI_CPU_PAUSE:
+ pause_local_cpu();
+ break;
+
+#ifdef CONFIG_KEXEC_CORE
case IPI_CPU_CRASH_STOP:
if (IS_ENABLED(CONFIG_KEXEC_CORE)) {
ipi_cpu_crash_stop(cpu, get_irq_regs());
@@ -918,6 +1018,7 @@ static void do_handle_IPI(int ipinr)
unreachable();
}
break;
+#endif
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
case IPI_TIMER:
@@ -939,9 +1040,11 @@ static void do_handle_IPI(int ipinr)
nmi_cpu_backtrace(get_irq_regs());
break;
+#ifdef CONFIG_KGDB
case IPI_KGDB_ROUNDUP:
kgdb_nmicallback(cpu, get_irq_regs());
break;
+#endif
default:
pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr);
@@ -971,9 +1074,14 @@ static bool ipi_should_be_nmi(enum ipi_msg_type ipi)
switch (ipi) {
case IPI_CPU_STOP:
+ case IPI_CPU_PAUSE:
+#ifdef CONFIG_KEXEC_CORE
case IPI_CPU_CRASH_STOP:
+#endif
case IPI_CPU_BACKTRACE:
+#ifdef CONFIG_KGDB
case IPI_KGDB_ROUNDUP:
+#endif
return true;
default:
return false;
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 5113753f3ac9..da6f2a7d665e 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -46,6 +46,18 @@ struct vmemmap_remap_walk {
unsigned long flags;
};
+#ifndef vmemmap_update_lock
+static void vmemmap_update_lock(void)
+{
+}
+#endif
+
+#ifndef vmemmap_update_unlock
+static void vmemmap_update_unlock(void)
+{
+}
+#endif
+
#ifndef vmemmap_update_pmd
static inline void vmemmap_update_pmd(unsigned long addr,
pmd_t *pmdp, pte_t *ptep)
@@ -194,10 +206,12 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
VM_BUG_ON(!PAGE_ALIGNED(start | end));
+ vmemmap_update_lock();
mmap_read_lock(&init_mm);
ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops,
NULL, walk);
mmap_read_unlock(&init_mm);
+ vmemmap_update_unlock();
if (ret)
return ret;
Powered by blists - more mailing lists