linux-kernel - Re: [RFC PATCH] perf/smmuv3: Fix shared interrupt handling

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <72C1BA0A-E4D4-4740-860D-B343A836821E@amperemail.onmicrosoft.com>
Date:   Thu, 30 Apr 2020 15:11:03 -0700
From:   Tuan Phan <tuanphan@...eremail.onmicrosoft.com>
To:     Robin Murphy <robin.murphy@....com>
Cc:     Will Deacon <will@...nel.org>, Mark Rutland <mark.rutland@....com>,
        linux-arm-kernel@...ts.infradead.org,
        shameerali.kolothum.thodi@...wei.com, john.garry@...wei.com,
        harb@...erecomputing.com, tuanphan@...amperecomputing.com,
        linux-kernel@...r.kernel.org
Subject: Re: [RFC PATCH] perf/smmuv3: Fix shared interrupt handling

Hi Robin,

> On Apr 8, 2020, at 9:49 AM, Robin Murphy <robin.murphy@....com> wrote:
> 
> IRQF_SHARED is dangerous, since it allows other agents to retarget the
> IRQ's affinity without migrating PMU contexts to match, breaking the way
> in which perf manages mutual exclusion for accessing events. Although
> this means it's not realistically possible to support PMU IRQs being
> shared with other drivers, we *can* handle sharing between multiple PMU
> instances with some explicit affinity bookkeeping and manual interrupt
> multiplexing.
> 
> RCU helps us handle interrupts efficiently without having to worry about
> fine-grained locking for relatively-theoretical race conditions with the
> probe/remove/CPU hotplug slow paths. The resulting machinery ends up
> looking largely generic, so it should be feasible to factor out with a
> "system PMU" base class for similar multi-instance drivers.
> 
> Signed-off-by: Robin Murphy <robin.murphy@....com>
> ---
> 
> RFC because I don't have the means to test it, and if the general
> approach passes muster then I'd want to tackle the aforementioned
> factoring-out before merging anything anyway.
> 
> Robin.
> 
> 
> drivers/perf/arm_smmuv3_pmu.c | 215 ++++++++++++++++++++++++----------
> 1 file changed, 152 insertions(+), 63 deletions(-)
> 
> diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
> index d704eccc548f..8daa7ac6e801 100644
> --- a/drivers/perf/arm_smmuv3_pmu.c
> +++ b/drivers/perf/arm_smmuv3_pmu.c
> @@ -47,8 +47,11 @@
> #include <linux/kernel.h>
> #include <linux/list.h>
> #include <linux/msi.h>
> +#include <linux/mutex.h>
> #include <linux/perf_event.h>
> #include <linux/platform_device.h>
> +#include <linux/rculist.h>
> +#include <linux/refcount.h>
> #include <linux/smp.h>
> #include <linux/sysfs.h>
> #include <linux/types.h>
> @@ -98,13 +101,24 @@
> 
> static int cpuhp_state_num;
> 
> -struct smmu_pmu {
> +static LIST_HEAD(smmu_pmu_affinities);
> +static DEFINE_MUTEX(smmu_pmu_affinity_lock);
> +
> +struct smmu_pmu_affinity {
> 	struct hlist_node node;
> +	struct list_head affinity_list;
> +	struct list_head instance_list;
> +	refcount_t refcount;
> +	unsigned int irq;
> +	unsigned int cpu;
> +};
> +
> +struct smmu_pmu {
> 	struct perf_event *events[SMMU_PMCG_MAX_COUNTERS];
> 	DECLARE_BITMAP(used_counters, SMMU_PMCG_MAX_COUNTERS);
> 	DECLARE_BITMAP(supported_events, SMMU_PMCG_ARCH_MAX_EVENTS);
> -	unsigned int irq;
> -	unsigned int on_cpu;
> +	struct smmu_pmu_affinity *affinity;
> +	struct list_head affinity_list;
> 	struct pmu pmu;
> 	unsigned int num_counters;
> 	struct device *dev;
> @@ -394,7 +408,7 @@ static int smmu_pmu_event_init(struct perf_event *event)
> 	 * Ensure all events are on the same cpu so all events are in the
> 	 * same cpu context, to avoid races on pmu_enable etc.
> 	 */
> -	event->cpu = smmu_pmu->on_cpu;
> +	event->cpu = smmu_pmu->affinity->cpu;
> 
> 	return 0;
> }
> @@ -478,9 +492,10 @@ static ssize_t smmu_pmu_cpumask_show(struct device *dev,
> 				     struct device_attribute *attr,
> 				     char *buf)
> {
> -	struct smmu_pmu *smmu_pmu = to_smmu_pmu(dev_get_drvdata(dev));
> +	struct pmu *pmu = dev_get_drvdata(dev);
> +	struct smmu_pmu_affinity *aff = to_smmu_pmu(pmu)->affinity;
> 
> -	return cpumap_print_to_pagebuf(true, buf, cpumask_of(smmu_pmu->on_cpu));
> +	return cpumap_print_to_pagebuf(true, buf, cpumask_of(aff->cpu));
> }
> 
> static struct device_attribute smmu_pmu_cpumask_attr =
> @@ -584,50 +599,140 @@ static const struct attribute_group *smmu_pmu_attr_grps[] = {
> 
> static int smmu_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
> {
> -	struct smmu_pmu *smmu_pmu;
> +	struct smmu_pmu_affinity *aff;
> +	struct smmu_pmu *pmu;
> 	unsigned int target;
> 
> -	smmu_pmu = hlist_entry_safe(node, struct smmu_pmu, node);
> -	if (cpu != smmu_pmu->on_cpu)
> +	aff = hlist_entry_safe(node, struct smmu_pmu_affinity, node);
> +	if (cpu != aff->cpu)
> 		return 0;
> 
> 	target = cpumask_any_but(cpu_online_mask, cpu);
> 	if (target >= nr_cpu_ids)
> 		return 0;
> 
> -	perf_pmu_migrate_context(&smmu_pmu->pmu, cpu, target);
> -	smmu_pmu->on_cpu = target;
> -	WARN_ON(irq_set_affinity_hint(smmu_pmu->irq, cpumask_of(target)));
> +	/* We're only reading, but this isn't the place to be involving RCU */
> +	mutex_lock(&smmu_pmu_affinity_lock);
> +	list_for_each_entry(pmu, &aff->instance_list, affinity_list)
> +		perf_pmu_migrate_context(&pmu->pmu, aff->cpu, target);
> +	mutex_unlock(&smmu_pmu_affinity_lock);
> +
> +	WARN_ON(irq_set_affinity_hint(aff->irq, cpumask_of(target)));
> +	aff->cpu = target;
> 
> 	return 0;
> }
> 
> static irqreturn_t smmu_pmu_handle_irq(int irq_num, void *data)
> {
> -	struct smmu_pmu *smmu_pmu = data;
> -	u64 ovsr;
> -	unsigned int idx;
> +	struct smmu_pmu_affinity *aff = data;
> +	struct smmu_pmu *smmu_pmu;
> +	irqreturn_t ret = IRQ_NONE;
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(smmu_pmu, &aff->instance_list, affinity_list) {
> +		unsigned int idx;
> +		u64 ovsr = readq(smmu_pmu->reloc_base + SMMU_PMCG_OVSSET0);
> 
> -	ovsr = readq(smmu_pmu->reloc_base + SMMU_PMCG_OVSSET0);
> -	if (!ovsr)
> -		return IRQ_NONE;
> +		if (!ovsr)
> +			continue;
> 
> -	writeq(ovsr, smmu_pmu->reloc_base + SMMU_PMCG_OVSCLR0);
> +		writeq(ovsr, smmu_pmu->reloc_base + SMMU_PMCG_OVSCLR0);
> 
> -	for_each_set_bit(idx, (unsigned long *)&ovsr, smmu_pmu->num_counters) {
> -		struct perf_event *event = smmu_pmu->events[idx];
> -		struct hw_perf_event *hwc;
> +		for_each_set_bit(idx, (unsigned long *)&ovsr, smmu_pmu->num_counters) {
> +			struct perf_event *event = smmu_pmu->events[idx];
> 
> -		if (WARN_ON_ONCE(!event))
> -			continue;
> +			if (WARN_ON_ONCE(!event))
> +				continue;
> +
> +			smmu_pmu_event_update(event);
> +			smmu_pmu_set_period(smmu_pmu, &event->hw);
> +		}
> +		ret = IRQ_HANDLED;
> +	}
> +	rcu_read_unlock();
> +
> +	return ret;
> +}
> +
> +static struct smmu_pmu_affinity *__smmu_pmu_get_affinity(int irq)
> +{
> +	struct smmu_pmu_affinity *aff;
> +	int ret;
> +
> +	list_for_each_entry(aff, &smmu_pmu_affinities, affinity_list)
> +		if (aff->irq == irq && refcount_inc_not_zero(&aff->refcount))
> +			return aff;
> +
> +	aff = kzalloc(sizeof(*aff), GFP_KERNEL);
> +	if (!aff)
> +		return ERR_PTR(-ENOMEM);
> +
Do we need to initialize aff->instance_list?

> +	/* Pick one CPU to be the preferred one to use */
> +	aff->cpu = raw_smp_processor_id();
> +	refcount_set(&aff->refcount, 1);
> +
> +	ret = request_irq(irq, smmu_pmu_handle_irq,
> +			  IRQF_NOBALANCING | IRQF_NO_THREAD,
> +			  "smmuv3-pmu", aff);
Do we need to save irq to aff->irq?

> +	if (ret)
> +		goto out_free_aff;
> +
> +	ret = irq_set_affinity_hint(irq, cpumask_of(aff->cpu));
> +	if (ret)
> +		goto out_free_irq;
> +
> +	ret = cpuhp_state_add_instance_nocalls(cpuhp_state_num, &aff->node);
> +	if (ret)
> +		goto out_free_irq;
> +
> +	list_add(&aff->affinity_list, &smmu_pmu_affinities);
> +	return aff;
> +
> +out_free_irq:
> +	free_irq(irq, aff);
> +out_free_aff:
> +	kfree(aff);
> +	return ERR_PTR(ret);
> +}
> +
> +static int smmu_pmu_get_affinity(struct smmu_pmu *pmu, int irq)
> +{
> +	struct smmu_pmu_affinity *aff;
> +
> +	mutex_lock(&smmu_pmu_affinity_lock);
> +	aff = __smmu_pmu_get_affinity(irq);
> +	mutex_unlock(&smmu_pmu_affinity_lock);
> +
> +	if (IS_ERR(aff))
> +		return PTR_ERR(aff);
> +
> +	pmu->affinity = aff;
> +	mutex_lock(&smmu_pmu_affinity_lock);
> +	list_add_rcu(&pmu->affinity_list, &aff->instance_list);
> +	mutex_unlock(&smmu_pmu_affinity_lock);
> +
> +	return 0;
> +}
> +
> +static void smmu_pmu_put_affinity(struct smmu_pmu *pmu)
> +{
> +	struct smmu_pmu_affinity *aff = pmu->affinity;
> 
> -		smmu_pmu_event_update(event);
> -		hwc = &event->hw;
> +	mutex_lock(&smmu_pmu_affinity_lock);
> +	list_del_rcu(&pmu->affinity_list);
> 
> -		smmu_pmu_set_period(smmu_pmu, hwc);
> +	if (!refcount_dec_and_test(&aff->refcount)) {
> +		mutex_unlock(&smmu_pmu_affinity_lock);
> +		return;
> 	}
> 
> -	return IRQ_HANDLED;
> +	list_del(&aff->affinity_list);
> +	mutex_unlock(&smmu_pmu_affinity_lock);
> +
> +	free_irq(aff->irq, aff);
> +	cpuhp_state_remove_instance_nocalls(cpuhp_state_num, &aff->node);
> +	kfree(aff);
> }
> 
> static void smmu_pmu_free_msis(void *data)
> @@ -652,7 +757,7 @@ static void smmu_pmu_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
> 		       pmu->reg_base + SMMU_PMCG_IRQ_CFG2);
> }
> 
> -static void smmu_pmu_setup_msi(struct smmu_pmu *pmu)
> +static int smmu_pmu_setup_msi(struct smmu_pmu *pmu)
> {
> 	struct msi_desc *desc;
> 	struct device *dev = pmu->dev;
> @@ -663,34 +768,34 @@ static void smmu_pmu_setup_msi(struct smmu_pmu *pmu)
> 
> 	/* MSI supported or not */
> 	if (!(readl(pmu->reg_base + SMMU_PMCG_CFGR) & SMMU_PMCG_CFGR_MSI))
> -		return;
> +		return 0;
> 
> 	ret = platform_msi_domain_alloc_irqs(dev, 1, smmu_pmu_write_msi_msg);
> 	if (ret) {
> 		dev_warn(dev, "failed to allocate MSIs\n");
> -		return;
> +		return ret;
> 	}
> 
> 	desc = first_msi_entry(dev);
> 	if (desc)
> -		pmu->irq = desc->irq;
> +		ret = desc->irq;
> 
> 	/* Add callback to free MSIs on teardown */
> 	devm_add_action(dev, smmu_pmu_free_msis, dev);
> +	return ret;
> }
> 
> static int smmu_pmu_setup_irq(struct smmu_pmu *pmu)
> {
> -	unsigned long flags = IRQF_NOBALANCING | IRQF_SHARED | IRQF_NO_THREAD;
> -	int irq, ret = -ENXIO;
> +	int irq;
> 
> -	smmu_pmu_setup_msi(pmu);
> +	irq = smmu_pmu_setup_msi(pmu);
> +	if (irq <= 0)
> +		irq = platform_get_irq(to_platform_device(pmu->dev), 0);
> +	if (irq < 0)
> +		return irq;
> 
> -	irq = pmu->irq;
> -	if (irq)
> -		ret = devm_request_irq(pmu->dev, irq, smmu_pmu_handle_irq,
> -				       flags, "smmuv3-pmu", pmu);
> -	return ret;
> +	return smmu_pmu_get_affinity(pmu, irq);
> }
> 
> static void smmu_pmu_reset(struct smmu_pmu *smmu_pmu)
> @@ -730,7 +835,7 @@ static int smmu_pmu_probe(struct platform_device *pdev)
> 	struct resource *res_0;
> 	u32 cfgr, reg_size;
> 	u64 ceid_64[2];
> -	int irq, err;
> +	int err;
> 	char *name;
> 	struct device *dev = &pdev->dev;
> 
> @@ -771,10 +876,6 @@ static int smmu_pmu_probe(struct platform_device *pdev)
> 		smmu_pmu->reloc_base = smmu_pmu->reg_base;
> 	}
> 
> -	irq = platform_get_irq(pdev, 0);
> -	if (irq > 0)
> -		smmu_pmu->irq = irq;
> -
> 	ceid_64[0] = readq_relaxed(smmu_pmu->reg_base + SMMU_PMCG_CEID0);
> 	ceid_64[1] = readq_relaxed(smmu_pmu->reg_base + SMMU_PMCG_CEID1);
> 	bitmap_from_arr32(smmu_pmu->supported_events, (u32 *)ceid_64,
> @@ -789,12 +890,6 @@ static int smmu_pmu_probe(struct platform_device *pdev)
> 
> 	smmu_pmu_reset(smmu_pmu);
> 
> -	err = smmu_pmu_setup_irq(smmu_pmu);
> -	if (err) {
> -		dev_err(dev, "Setup irq failed, PMU @%pa\n", &res_0->start);
> -		return err;
> -	}
> -
> 	name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "smmuv3_pmcg_%llx",
> 			      (res_0->start) >> SMMU_PMCG_PA_SHIFT);
> 	if (!name) {
> @@ -804,16 +899,9 @@ static int smmu_pmu_probe(struct platform_device *pdev)
> 
> 	smmu_pmu_get_acpi_options(smmu_pmu);
> 
> -	/* Pick one CPU to be the preferred one to use */
> -	smmu_pmu->on_cpu = raw_smp_processor_id();
> -	WARN_ON(irq_set_affinity_hint(smmu_pmu->irq,
> -				      cpumask_of(smmu_pmu->on_cpu)));
> -
> -	err = cpuhp_state_add_instance_nocalls(cpuhp_state_num,
> -					       &smmu_pmu->node);
> +	err = smmu_pmu_setup_irq(smmu_pmu);
> 	if (err) {
> -		dev_err(dev, "Error %d registering hotplug, PMU @%pa\n",
> -			err, &res_0->start);
> +		dev_err(dev, "Setup irq failed, PMU @%pa\n", &res_0->start);
> 		return err;
> 	}
> 
> @@ -832,7 +920,8 @@ static int smmu_pmu_probe(struct platform_device *pdev)
> 	return 0;
> 
> out_unregister:
> -	cpuhp_state_remove_instance_nocalls(cpuhp_state_num, &smmu_pmu->node);
> +	smmu_pmu_put_affinity(smmu_pmu);
> +	synchronize_rcu();
> 	return err;
> }
> 
> @@ -840,9 +929,9 @@ static int smmu_pmu_remove(struct platform_device *pdev)
> {
> 	struct smmu_pmu *smmu_pmu = platform_get_drvdata(pdev);
> 
> +	smmu_pmu_put_affinity(smmu_pmu);
> +	/* perf will synchronise RCU before devres can free smmu_pmu */
> 	perf_pmu_unregister(&smmu_pmu->pmu);
> -	cpuhp_state_remove_instance_nocalls(cpuhp_state_num, &smmu_pmu->node);
> -
> 	return 0;
> }
> 
> -- 
> 2.23.0.dirty
>