[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <YzRsibv4Iqw2Kk0T@FVFF77S0Q05N>
Date: Wed, 28 Sep 2022 16:47:21 +0100
From: Mark Rutland <mark.rutland@....com>
To: Pierre Gondois <pierre.gondois@....com>
Cc: linux-kernel@...r.kernel.org,
Dietmar Eggemann <dietmar.eggemann@....com>,
Valentin Schneider <vschneid@...hat.com>,
Thomas Gleixner <tglx@...utronix.de>,
Sebastian Andrzej Siewior <bigeasy@...utronix.de>,
Will Deacon <will@...nel.org>,
linux-arm-kernel@...ts.infradead.org
Subject: Re: [PATCH v1 1/1] arm_pmu: acpi: Pre-allocate pmu structures
Hi Pierre,
Thanks for this, and sorry for the delayed reply.
On Mon, Sep 12, 2022 at 05:51:04PM +0200, Pierre Gondois wrote:
> On an Ampere Altra,
> Running a preemp_rt kernel based on v5.19-rc3-rt4 on an Ampere Altra
> triggers:
> [ 12.642389] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:46
> [ 12.642402] in_atomic(): 0, irqs_disabled(): 128, non_block: 0, pid: 24, name: cpuhp/0
> [ 12.642406] preempt_count: 0, expected: 0
> [ 12.642409] RCU nest depth: 0, expected: 0
> [ 12.642411] 3 locks held by cpuhp/0/24:
> [ 12.642414] #0: ffffd8a22c8870d0 (cpu_hotplug_lock){++++}-{0:0}, at: cpuhp_thread_fun (linux/kernel/cpu.c:754)
> [ 12.642429] #1: ffffd8a22c887120 (cpuhp_state-up){+.+.}-{0:0}, at: cpuhp_thread_fun (linux/kernel/cpu.c:754)
> [ 12.642436] #2: ffff083e7f0d97b8 ((&c->lock)){+.+.}-{3:3}, at: ___slab_alloc (linux/mm/slub.c:2954)
> [ 12.642458] irq event stamp: 42
> [ 12.642460] hardirqs last enabled at (41): finish_task_switch (linux/./arch/arm64/include/asm/irqflags.h:35)
> [ 12.642471] hardirqs last disabled at (42): cpuhp_thread_fun (linux/kernel/cpu.c:776 (discriminator 1))
> [ 12.642476] softirqs last enabled at (0): copy_process (linux/./include/linux/lockdep.h:191)
> [ 12.642484] softirqs last disabled at (0): 0x0
> [ 12.642495] CPU: 0 PID: 24 Comm: cpuhp/0 Tainted: G W 5.19.0-rc3-rt4-custom-piegon01-rt_0 #142
> [ 12.642500] Hardware name: WIWYNN Mt.Jade Server System B81.03001.0005/Mt.Jade Motherboard, BIOS 1.08.20220218 (SCP: 1.08.20220218) 2022/02/18
> [ 12.642506] Call trace:
> [ 12.642508] dump_backtrace (linux/arch/arm64/kernel/stacktrace.c:200)
> [ 12.642514] show_stack (linux/arch/arm64/kernel/stacktrace.c:207)
> [ 12.642517] dump_stack_lvl (linux/lib/dump_stack.c:107)
> [ 12.642523] dump_stack (linux/lib/dump_stack.c:114)
> [ 12.642527] __might_resched (linux/kernel/sched/core.c:9929)
> [ 12.642531] rt_spin_lock (linux/kernel/locking/rtmutex.c:1732 (discriminator 4))
> [ 12.642536] ___slab_alloc (linux/mm/slub.c:2954)
> [ 12.642539] __slab_alloc.isra.0 (linux/mm/slub.c:3116)
> [ 12.642543] kmem_cache_alloc_trace (linux/mm/slub.c:3207)
> [ 12.642549] __armpmu_alloc (linux/./include/linux/slab.h:600)
> [ 12.642558] armpmu_alloc_atomic (linux/drivers/perf/arm_pmu.c:927)
> [ 12.642562] arm_pmu_acpi_cpu_starting (linux/drivers/perf/arm_pmu_acpi.c:204)
> [ 12.642568] cpuhp_invoke_callback (linux/kernel/cpu.c:192)
> [ 12.642571] cpuhp_thread_fun (linux/kernel/cpu.c:777 (discriminator 3))
> [ 12.642573] smpboot_thread_fn (linux/kernel/smpboot.c:164 (discriminator 3))
> [ 12.642580] kthread (linux/kernel/kthread.c:376)
> [ 12.642584] ret_from_fork (linux/arch/arm64/kernel/entry.S:868)
>
> arm_pmu_acpi_cpu_starting() is called in the STARTING hotplug section,
> which runs with interrupts disabled. To avoid allocating memory and
> sleeping in this function, the pmu structures must be pre-allocated.
>
> On ACPI systems, the count of PMUs is unknown until CPUs are
> hotplugged, cf:
> commit 0dc1a1851af1 ("arm_pmu: add armpmu_alloc_atomic()")
>
> At most #PMU_IRQs pmu structures will be used and thus need to be
> pre-allocated.
> In arm_pmu_acpi_cpu_starting() subcalls, after checking the cpuid,
> decide to use or re-use a pre-allocated pmu structure. Thus the
> pre-allocated pmu struct can be seen as a pool.
> When probing, search and free unused pmu structures.
I think in retrospect I was trying to be too clever with
arm_pmu_acpi_cpu_starting() handling boot-time CPUs and late hotplug, and we
can make this simpler by handling the boot-time probing synchronously within
arm_pmu_acpi_probe(), removing a bunch of state.
I had a go at that, and in testing (in a QEMU TCG VM) atop arm64/for-next/core,
that seems to work (even with a faked-up heterogenous config). I've pushed that
to my `arm_pmu/acpi/rework` branch at:
https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/log/?h=arm_pmu/acpi/rework
git://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git arm_pmu/acpi/rework
... does that work for you?
Thanks,
Mark.
>
> Platforms used to test this patch:
> - Juno-r2 (2 clusters: 2 big and 4 little CPUs) with 2 PMUs and
> one interrupt for each CPU
> - Ampere Altra with 1 PMU and one interrupt for the 160 CPUs
>
> Link: https://lore.kernel.org/all/20210810134127.1394269-2-valentin.schneider@arm.com
> Reported-by: Valentin Schneider <vschneid@...hat.com>
> Signed-off-by: Pierre Gondois <pierre.gondois@....com>
> ---
> drivers/perf/arm_pmu.c | 17 +-----
> drivers/perf/arm_pmu_acpi.c | 114 ++++++++++++++++++++++++++++++-----
> include/linux/perf/arm_pmu.h | 1 -
> 3 files changed, 103 insertions(+), 29 deletions(-)
>
> diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
> index 59d3980b8ca2..731e793dfef5 100644
> --- a/drivers/perf/arm_pmu.c
> +++ b/drivers/perf/arm_pmu.c
> @@ -861,16 +861,16 @@ static void cpu_pmu_destroy(struct arm_pmu *cpu_pmu)
> &cpu_pmu->node);
> }
>
> -static struct arm_pmu *__armpmu_alloc(gfp_t flags)
> +struct arm_pmu *armpmu_alloc(void)
> {
> struct arm_pmu *pmu;
> int cpu;
>
> - pmu = kzalloc(sizeof(*pmu), flags);
> + pmu = kzalloc(sizeof(*pmu), GFP_KERNEL);
> if (!pmu)
> goto out;
>
> - pmu->hw_events = alloc_percpu_gfp(struct pmu_hw_events, flags);
> + pmu->hw_events = alloc_percpu_gfp(struct pmu_hw_events, GFP_KERNEL);
> if (!pmu->hw_events) {
> pr_info("failed to allocate per-cpu PMU data.\n");
> goto out_free_pmu;
> @@ -916,17 +916,6 @@ static struct arm_pmu *__armpmu_alloc(gfp_t flags)
> return NULL;
> }
>
> -struct arm_pmu *armpmu_alloc(void)
> -{
> - return __armpmu_alloc(GFP_KERNEL);
> -}
> -
> -struct arm_pmu *armpmu_alloc_atomic(void)
> -{
> - return __armpmu_alloc(GFP_ATOMIC);
> -}
> -
> -
> void armpmu_free(struct arm_pmu *pmu)
> {
> free_percpu(pmu->hw_events);
> diff --git a/drivers/perf/arm_pmu_acpi.c b/drivers/perf/arm_pmu_acpi.c
> index 96ffadd654ff..599d8be78950 100644
> --- a/drivers/perf/arm_pmu_acpi.c
> +++ b/drivers/perf/arm_pmu_acpi.c
> @@ -17,6 +17,8 @@
>
> static DEFINE_PER_CPU(struct arm_pmu *, probed_pmus);
> static DEFINE_PER_CPU(int, pmu_irqs);
> +static unsigned int preallocated_pmus_count;
> +static struct arm_pmu **preallocated_pmus;
>
> static int arm_pmu_acpi_register_irq(int cpu)
> {
> @@ -187,30 +189,108 @@ static int arm_pmu_acpi_parse_irqs(void)
> return err;
> }
>
> +/* Count the number of different PMU IRQs for the input PMU. */
> +static int count_pmu_irqs(struct arm_pmu *pmu)
> +{
> + struct pmu_hw_events __percpu *hw_events = pmu->hw_events;
> + unsigned int num_irqs = 0;
> + int cpu, probed_cpu, irq;
> +
> + for_each_cpu(cpu, &pmu->supported_cpus) {
> + irq = per_cpu(hw_events->irq, cpu);
> + for_each_cpu(probed_cpu, &pmu->supported_cpus) {
> + if (irq == per_cpu(hw_events->irq, probed_cpu)) {
> + if (probed_cpu == cpu)
> + num_irqs++;
> + break;
> + }
> + }
> + }
> +
> + return num_irqs;
> +}
> +
> +/*
> + * Count the number of different PMU IRQs across all the PMUs of the system
> + * to get an upper bound of the number of struct arm_pmu to pre-allocate.
> + */
> +static int count_all_pmu_irqs(void)
> +{
> + unsigned int num_irqs = 0;
> + int cpu, probed_cpu, irq;
> +
> + for_each_possible_cpu(cpu) {
> + irq = per_cpu(pmu_irqs, cpu);
> + for_each_possible_cpu(probed_cpu) {
> + if (irq == per_cpu(pmu_irqs, probed_cpu)) {
> + if (probed_cpu == cpu)
> + num_irqs++;
> + break;
> + }
> + }
> + }
> +
> + return num_irqs;
> +}
> +
> +static unsigned int pmu_preallocate(void)
> +{
> + unsigned int i, num_irqs = count_all_pmu_irqs();
> +
> + preallocated_pmus = kcalloc(num_irqs, sizeof(*preallocated_pmus),
> + GFP_KERNEL);
> + if (!preallocated_pmus) {
> + pr_err("Failed to pre-allocate %d pmu struct\n", num_irqs);
> + return -ENOMEM;
> + }
> +
> + for (i = 0; i < num_irqs; i++) {
> + preallocated_pmus[i] = armpmu_alloc();
> + if (!preallocated_pmus[i])
> + return -ENOMEM;
> + }
> +
> + preallocated_pmus_count = num_irqs;
> + return 0;
> +}
> +
> static struct arm_pmu *arm_pmu_acpi_find_alloc_pmu(void)
> {
> unsigned long cpuid = read_cpuid_id();
> struct arm_pmu *pmu;
> - int cpu;
> + unsigned int i;
>
> - for_each_possible_cpu(cpu) {
> - pmu = per_cpu(probed_pmus, cpu);
> - if (!pmu || pmu->acpi_cpuid != cpuid)
> - continue;
> + for (i = 0; i < preallocated_pmus_count; i++) {
> + pmu = preallocated_pmus[i];
>
> - return pmu;
> + if (!pmu->acpi_cpuid) {
> + pmu->acpi_cpuid = cpuid;
> + return pmu;
> + } else if (pmu->acpi_cpuid == cpuid)
> + return pmu;
> }
>
> - pmu = armpmu_alloc_atomic();
> - if (!pmu) {
> - pr_warn("Unable to allocate PMU for CPU%d\n",
> - smp_processor_id());
> - return NULL;
> - }
> + pr_err("Unable to find pre-allocated PMU for CPU%d\n",
> + smp_processor_id());
> + return NULL;
> +}
>
> - pmu->acpi_cpuid = cpuid;
> +static void pmu_free_unused_preallocated(struct arm_pmu *pmu)
> +{
> + int i, unused_num_irqs = count_pmu_irqs(pmu);
>
> - return pmu;
> + if (unused_num_irqs <= 1)
> + return;
> + else if (unused_num_irqs >= preallocated_pmus_count) {
> + pr_err("Trying to free %d pmu struct when %d are allocated\n",
> + unused_num_irqs, preallocated_pmus_count);
> + return;
> + }
> +
> + unused_num_irqs--;
> + for (i = 0; i < unused_num_irqs; i++)
> + armpmu_free(preallocated_pmus[preallocated_pmus_count - i - 1]);
> + preallocated_pmus_count -= unused_num_irqs;
> }
>
> /*
> @@ -311,6 +391,8 @@ int arm_pmu_acpi_probe(armpmu_init_fn init_fn)
> if (!pmu || pmu->name)
> continue;
>
> + pmu_free_unused_preallocated(pmu);
> +
> ret = init_fn(pmu);
> if (ret == -ENODEV) {
> /* PMU not handled by this driver, or not present */
> @@ -351,6 +433,10 @@ static int arm_pmu_acpi_init(void)
> if (ret)
> return ret;
>
> + ret = pmu_preallocate();
> + if (ret)
> + return ret;
> +
> ret = cpuhp_setup_state(CPUHP_AP_PERF_ARM_ACPI_STARTING,
> "perf/arm/pmu_acpi:starting",
> arm_pmu_acpi_cpu_starting, NULL);
> diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
> index 0407a38b470a..049908af3595 100644
> --- a/include/linux/perf/arm_pmu.h
> +++ b/include/linux/perf/arm_pmu.h
> @@ -173,7 +173,6 @@ void kvm_host_pmu_init(struct arm_pmu *pmu);
>
> /* Internal functions only for core arm_pmu code */
> struct arm_pmu *armpmu_alloc(void);
> -struct arm_pmu *armpmu_alloc_atomic(void);
> void armpmu_free(struct arm_pmu *pmu);
> int armpmu_register(struct arm_pmu *pmu);
> int armpmu_request_irq(int irq, int cpu);
> --
> 2.25.1
>
Powered by blists - more mailing lists