lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <896f03f9-4878-4d20-9992-ef3f39253acc@arm.com>
Date: Wed, 15 Jan 2025 21:32:43 +0000
From: Robin Murphy <robin.murphy@....com>
To: Zong Li <zong.li@...ive.com>, joro@...tes.org, will@...nel.org,
 tjeznach@...osinc.com, paul.walmsley@...ive.com, palmer@...belt.com,
 aou@...s.berkeley.edu, luxu.kernel@...edance.com,
 linux-kernel@...r.kernel.org, iommu@...ts.linux.dev,
 linux-riscv@...ts.infradead.org
Subject: Re: [PATCH 1/2] iommu/riscv: add RISC-V IOMMU PMU support

On 2025-01-15 3:03 am, Zong Li wrote:
> Support for the RISC-V IOMMU hardware performance monitor includes
> both counting and sampling modes.
> 
> The specification does not define an event ID for counting the
> number of clock cycles, meaning there is no associated `iohpmevt0`.
> However, we need an event for counting cycle, so we reserve the
> maximum event ID for this purpose.

Why not use an extra config bit to encode cycles events completely 
independently of the regular eventID space? Or even just use 0 since by 
definition that cannot overlap a valid eventID?

> Signed-off-by: Zong Li <zong.li@...ive.com>
> Tested-by: Xu Lu <luxu.kernel@...edance.com>
> ---
>   drivers/iommu/riscv/Makefile     |   2 +-
>   drivers/iommu/riscv/iommu-bits.h |  16 +
>   drivers/iommu/riscv/iommu-pmu.c  | 486 +++++++++++++++++++++++++++++++
>   drivers/iommu/riscv/iommu.h      |   8 +
>   4 files changed, 511 insertions(+), 1 deletion(-)
>   create mode 100644 drivers/iommu/riscv/iommu-pmu.c
> 
> diff --git a/drivers/iommu/riscv/Makefile b/drivers/iommu/riscv/Makefile
> index f54c9ed17d41..d36625a1fd08 100644
> --- a/drivers/iommu/riscv/Makefile
> +++ b/drivers/iommu/riscv/Makefile
> @@ -1,3 +1,3 @@
>   # SPDX-License-Identifier: GPL-2.0-only
> -obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-platform.o
> +obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-platform.o iommu-pmu.o
>   obj-$(CONFIG_RISCV_IOMMU_PCI) += iommu-pci.o
> diff --git a/drivers/iommu/riscv/iommu-bits.h b/drivers/iommu/riscv/iommu-bits.h
> index 98daf0e1a306..60523449f016 100644
> --- a/drivers/iommu/riscv/iommu-bits.h
> +++ b/drivers/iommu/riscv/iommu-bits.h
> @@ -17,6 +17,7 @@
>   #include <linux/types.h>
>   #include <linux/bitfield.h>
>   #include <linux/bits.h>
> +#include <linux/perf_event.h>
>   
>   /*
>    * Chapter 5: Memory Mapped register interface
> @@ -207,6 +208,7 @@ enum riscv_iommu_ddtp_modes {
>   /* 5.22 Performance monitoring event counters (31 * 64bits) */
>   #define RISCV_IOMMU_REG_IOHPMCTR_BASE	0x0068
>   #define RISCV_IOMMU_REG_IOHPMCTR(_n)	(RISCV_IOMMU_REG_IOHPMCTR_BASE + ((_n) * 0x8))
> +#define RISCV_IOMMU_IOHPMCTR_COUNTER	GENMASK_ULL(63, 0)
>   
>   /* 5.23 Performance monitoring event selectors (31 * 64bits) */
>   #define RISCV_IOMMU_REG_IOHPMEVT_BASE	0x0160
> @@ -250,6 +252,20 @@ enum riscv_iommu_hpmevent_id {
>   	RISCV_IOMMU_HPMEVENT_MAX        = 9
>   };
>   
> +/* Use maximum event ID for cycle event */
> +#define RISCV_IOMMU_HPMEVENT_CYCLE	GENMASK_ULL(14, 0)

It's also horribly confusing to use GENMASK for something which is not 
actually a mask at all.

> +#define RISCV_IOMMU_HPM_COUNTER_NUM	32
> +
> +struct riscv_iommu_pmu {
> +	struct pmu pmu;
> +	void __iomem *reg;
> +	int num_counters;
> +	u64 mask_counter;
> +	struct perf_event *events[RISCV_IOMMU_IOHPMEVT_CNT + 1];
> +	DECLARE_BITMAP(used_counters, RISCV_IOMMU_IOHPMEVT_CNT + 1);

Hmm, from experience I would expect these variables to be 
number-of-counters related, but I guess there must be some special 
cleverness going on since that number-of-counters looking 
RISCV_IOMMU_HPM_COUNTER_NUM definition is conspicuously not used, so it 
must be significant that these are instead related to the number of...

[ goes off to search code... ]

...event selectors? But with a magic +1 for reasons so obvious they 
clearly don't need explaining.

> +};
> +
>   /* 5.24 Translation request IOVA (64bits) */
>   #define RISCV_IOMMU_REG_TR_REQ_IOVA     0x0258
>   #define RISCV_IOMMU_TR_REQ_IOVA_VPN	GENMASK_ULL(63, 12)
> diff --git a/drivers/iommu/riscv/iommu-pmu.c b/drivers/iommu/riscv/iommu-pmu.c
> new file mode 100644
> index 000000000000..74eb1525cd32
> --- /dev/null
> +++ b/drivers/iommu/riscv/iommu-pmu.c
> @@ -0,0 +1,486 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (C) 2024 SiFive
> + *
> + * Authors
> + *	Zong Li <zong.li@...ive.com>
> + */
> +
> +#include <linux/io-64-nonatomic-hi-lo.h>
> +
> +#include "iommu.h"
> +#include "iommu-bits.h"
> +
> +#define to_riscv_iommu_pmu(p) (container_of(p, struct riscv_iommu_pmu, pmu))
> +
> +#define RISCV_IOMMU_PMU_ATTR_EXTRACTOR(_name, _mask)			\
> +	static inline u32 get_##_name(struct perf_event *event)		\
> +	{								\
> +		return FIELD_GET(_mask, event->attr.config);		\
> +	}								\
> +
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(event, RISCV_IOMMU_IOHPMEVT_EVENTID);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(partial_matching, RISCV_IOMMU_IOHPMEVT_DMASK);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(pid_pscid, RISCV_IOMMU_IOHPMEVT_PID_PSCID);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(did_gscid, RISCV_IOMMU_IOHPMEVT_DID_GSCID);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_pid_pscid, RISCV_IOMMU_IOHPMEVT_PV_PSCV);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_did_gscid, RISCV_IOMMU_IOHPMEVT_DV_GSCV);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_id_type, RISCV_IOMMU_IOHPMEVT_IDT);
> +
> +/* Formats */
> +PMU_FORMAT_ATTR(event, "config:0-14");
> +PMU_FORMAT_ATTR(partial_matching, "config:15");
> +PMU_FORMAT_ATTR(pid_pscid, "config:16-35");
> +PMU_FORMAT_ATTR(did_gscid, "config:36-59");
> +PMU_FORMAT_ATTR(filter_pid_pscid, "config:60");
> +PMU_FORMAT_ATTR(filter_did_gscid, "config:61");
> +PMU_FORMAT_ATTR(filter_id_type, "config:62");
> +
> +static struct attribute *riscv_iommu_pmu_formats[] = {
> +	&format_attr_event.attr,
> +	&format_attr_partial_matching.attr,
> +	&format_attr_pid_pscid.attr,
> +	&format_attr_did_gscid.attr,
> +	&format_attr_filter_pid_pscid.attr,
> +	&format_attr_filter_did_gscid.attr,
> +	&format_attr_filter_id_type.attr,
> +	NULL,
> +};
> +
> +static const struct attribute_group riscv_iommu_pmu_format_group = {
> +	.name = "format",
> +	.attrs = riscv_iommu_pmu_formats,
> +};
> +
> +/* Events */
> +static ssize_t riscv_iommu_pmu_event_show(struct device *dev,
> +					  struct device_attribute *attr,
> +					  char *page)
> +{
> +	struct perf_pmu_events_attr *pmu_attr;
> +
> +	pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
> +
> +	return sprintf(page, "event=0x%02llx\n", pmu_attr->id);

sysfs_emit()

> +}
> +
> +PMU_EVENT_ATTR(cycle, event_attr_cycle,
> +	       RISCV_IOMMU_HPMEVENT_CYCLE, riscv_iommu_pmu_event_show);
> +PMU_EVENT_ATTR(dont_count, event_attr_dont_count,
> +	       RISCV_IOMMU_HPMEVENT_INVALID, riscv_iommu_pmu_event_show);
> +PMU_EVENT_ATTR(untranslated_req, event_attr_untranslated_req,
> +	       RISCV_IOMMU_HPMEVENT_URQ, riscv_iommu_pmu_event_show);
> +PMU_EVENT_ATTR(translated_req, event_attr_translated_req,
> +	       RISCV_IOMMU_HPMEVENT_TRQ, riscv_iommu_pmu_event_show);
> +PMU_EVENT_ATTR(ats_trans_req, event_attr_ats_trans_req,
> +	       RISCV_IOMMU_HPMEVENT_ATS_RQ, riscv_iommu_pmu_event_show);
> +PMU_EVENT_ATTR(tlb_miss, event_attr_tlb_miss,
> +	       RISCV_IOMMU_HPMEVENT_TLB_MISS, riscv_iommu_pmu_event_show);
> +PMU_EVENT_ATTR(ddt_walks, event_attr_ddt_walks,
> +	       RISCV_IOMMU_HPMEVENT_DD_WALK, riscv_iommu_pmu_event_show);
> +PMU_EVENT_ATTR(pdt_walks, event_attr_pdt_walks,
> +	       RISCV_IOMMU_HPMEVENT_PD_WALK, riscv_iommu_pmu_event_show);
> +PMU_EVENT_ATTR(s_vs_pt_walks, event_attr_s_vs_pt_walks,
> +	       RISCV_IOMMU_HPMEVENT_S_VS_WALKS, riscv_iommu_pmu_event_show);
> +PMU_EVENT_ATTR(g_pt_walks, event_attr_g_pt_walks,
> +	       RISCV_IOMMU_HPMEVENT_G_WALKS, riscv_iommu_pmu_event_show);
> +
> +static struct attribute *riscv_iommu_pmu_events[] = {
> +	&event_attr_cycle.attr.attr,
> +	&event_attr_dont_count.attr.attr,
> +	&event_attr_untranslated_req.attr.attr,
> +	&event_attr_translated_req.attr.attr,
> +	&event_attr_ats_trans_req.attr.attr,
> +	&event_attr_tlb_miss.attr.attr,
> +	&event_attr_ddt_walks.attr.attr,
> +	&event_attr_pdt_walks.attr.attr,
> +	&event_attr_s_vs_pt_walks.attr.attr,
> +	&event_attr_g_pt_walks.attr.attr,
> +	NULL,
> +};
> +
> +static const struct attribute_group riscv_iommu_pmu_events_group = {
> +	.name = "events",
> +	.attrs = riscv_iommu_pmu_events,
> +};
> +
> +static const struct attribute_group *riscv_iommu_pmu_attr_grps[] = {
> +	&riscv_iommu_pmu_format_group,
> +	&riscv_iommu_pmu_events_group,
> +	NULL,
> +};

You also need a "cpumask" attribute to tell userspace this is a 
system/uncore PMU.

> +
> +/* PMU Operations */
> +static void riscv_iommu_pmu_set_counter(struct riscv_iommu_pmu *pmu, u32 idx,
> +					u64 value)
> +{
> +	void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES;
> +
> +	if (WARN_ON_ONCE(idx < 0 || idx > pmu->num_counters))

One of those conditions is literally impossible, the other should 
already be avoided by construction.

> +		return;
> +
> +	if (idx == 0)
> +		value = (value & ~RISCV_IOMMU_IOHPMCYCLES_OF) |
> +			 (readq(addr) & RISCV_IOMMU_IOHPMCYCLES_OF);

I don't think you need this - as best I can tell, you never initialise a 
counter without also (re)enabling the interrupt (which is logical), so 
since "value" for the cycle counter should always implicitly have OF=0 
anyway, it should work to just write it like the other counters.

> +	writeq(FIELD_PREP(RISCV_IOMMU_IOHPMCTR_COUNTER, value), addr + idx * 8);

Is RISCV_IOMMU_IOHPMCTR_COUNTER honestly useful? Or is it actively 
hurting readability by obfuscating that we are in fact just using the 
full 64-bit value in all those places?

> +}
> +
> +static u64 riscv_iommu_pmu_get_counter(struct riscv_iommu_pmu *pmu, u32 idx)
> +{
> +	void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES;
> +	u64 value;
> +
> +	if (WARN_ON_ONCE(idx < 0 || idx > pmu->num_counters))
> +		return -EINVAL;
> +
> +	value = readq(addr + idx * 8);

Might be worth leaving a comment just in case anyone does try to enable 
this for 32-bit that the io-64-nonatomic readq() isn't enough to work 
properly here.

> +
> +	if (idx == 0)
> +		return FIELD_GET(RISCV_IOMMU_IOHPMCYCLES_COUNTER, value);
> +
> +	return FIELD_GET(RISCV_IOMMU_IOHPMCTR_COUNTER, value);
> +}
> +
> +static u64 riscv_iommu_pmu_get_event(struct riscv_iommu_pmu *pmu, u32 idx)
> +{
> +	void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE;
> +
> +	if (WARN_ON_ONCE(idx < 0 || idx > pmu->num_counters))
> +		return 0;
> +
> +	/* There is no associtated IOHPMEVT0 for IOHPMCYCLES */
> +	if (idx == 0)
> +		return 0;
> +
> +	return readq(addr + (idx - 1) * 8);
> +}

That's a wonderfully expensive way to spell "pmu->events[idx]->hw.config"...

> +
> +static void riscv_iommu_pmu_set_event(struct riscv_iommu_pmu *pmu, u32 idx,
> +				      u64 value)
> +{
> +	void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE;
> +
> +	if (WARN_ON_ONCE(idx < 0 || idx > pmu->num_counters))
> +		return;
> +
> +	/* There is no associtated IOHPMEVT0 for IOHPMCYCLES */
> +	if (idx == 0)
> +		return;
> +
> +	writeq(value, addr + (idx - 1) * 8);
> +}
> +
> +static void riscv_iommu_pmu_enable_counter(struct riscv_iommu_pmu *pmu, u32 idx)
> +{
> +	void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH;
> +	u32 value = readl(addr);
> +
> +	writel(value & ~BIT(idx), addr);
> +}
> +
> +static void riscv_iommu_pmu_disable_counter(struct riscv_iommu_pmu *pmu, u32 idx)
> +{
> +	void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH;
> +	u32 value = readl(addr);
> +
> +	writel(value | BIT(idx), addr);
> +}
> +
> +static void riscv_iommu_pmu_enable_ovf_intr(struct riscv_iommu_pmu *pmu, u32 idx)
> +{
> +	u64 value;
> +
> +	if (get_event(pmu->events[idx]) == RISCV_IOMMU_HPMEVENT_CYCLE) {

And that's a very creative way to spell "if (idx == 0)".

> +		value = riscv_iommu_pmu_get_counter(pmu, idx) & ~RISCV_IOMMU_IOHPMCYCLES_OF;
> +		writeq(value, pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES);

(and as above, I think you can make this a no-op for the cycle counter)

> +	} else {
> +		value = riscv_iommu_pmu_get_event(pmu, idx) & ~RISCV_IOMMU_IOHPMEVT_OF;
> +		writeq(value, pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE + (idx - 1) * 8);
> +	}
> +}
> +
> +static void riscv_iommu_pmu_disable_ovf_intr(struct riscv_iommu_pmu *pmu, u32 idx)
> +{
> +	u64 value;
> +
> +	if (get_event(pmu->events[idx]) == RISCV_IOMMU_HPMEVENT_CYCLE) {
> +		value = riscv_iommu_pmu_get_counter(pmu, idx) | RISCV_IOMMU_IOHPMCYCLES_OF;
> +		writeq(value, pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES);
> +	} else {
> +		value = riscv_iommu_pmu_get_event(pmu, idx) | RISCV_IOMMU_IOHPMEVT_OF;
> +		writeq(value, pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE + (idx - 1) * 8);
> +	}
> +}

TBH I'd be inclined to just leave out all the dead cleanup code if the 
PMU driver is tied to the IOMMU driver and can never realistically be 
removed.

> +static void riscv_iommu_pmu_start_all(struct riscv_iommu_pmu *pmu)
> +{
> +	int idx;
> +
> +	for_each_set_bit(idx, pmu->used_counters, pmu->num_counters) {
> +		riscv_iommu_pmu_enable_ovf_intr(pmu, idx);

Surely this should only touch the counter(s) that overflowed and have 
been handled? It might be cleaner to keep that within the IRQ handler 
itself.

> +		riscv_iommu_pmu_enable_counter(pmu, idx);

This needs to start all active counters together, not one-by-one.

> +	}
> +}
> +
> +static void riscv_iommu_pmu_stop_all(struct riscv_iommu_pmu *pmu)
> +{
> +	writel(GENMASK_ULL(pmu->num_counters - 1, 0),
> +	       pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH);
> +}
> +
> +/* PMU APIs */
> +static int riscv_iommu_pmu_set_period(struct perf_event *event)
> +{
> +	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +	struct hw_perf_event *hwc = &event->hw;
> +	s64 left = local64_read(&hwc->period_left);
> +	s64 period = hwc->sample_period;
> +	u64 max_period = pmu->mask_counter;
> +	int ret = 0;
> +
> +	if (unlikely(left <= -period)) {
> +		left = period;
> +		local64_set(&hwc->period_left, left);
> +		hwc->last_period = period;
> +		ret = 1;
> +	}
> +
> +	if (unlikely(left <= 0)) {
> +		left += period;
> +		local64_set(&hwc->period_left, left);
> +		hwc->last_period = period;
> +		ret = 1;
> +	}

None of this is relevant or necessary. This is not a CPU PMU, so it 
can't support sampling because it doesn't have a meaningful context to 
sample.

> +	/*
> +	 * Limit the maximum period to prevent the counter value
> +	 * from overtaking the one we are about to program. In
> +	 * effect we are reducing max_period to account for
> +	 * interrupt latency (and we are being very conservative).
> +	 */
> +	if (left > (max_period >> 1))
> +		left = (max_period >> 1);
> +
> +	local64_set(&hwc->prev_count, (u64)-left);
> +	riscv_iommu_pmu_set_counter(pmu, hwc->idx, (u64)(-left) & max_period);
> +	perf_event_update_userpage(event);
> +
> +	return ret;
> +}
> +
> +static int riscv_iommu_pmu_event_init(struct perf_event *event)
> +{
> +	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +	struct hw_perf_event *hwc = &event->hw;

You first need to validate that the event is for this PMU at all, and 
return -ENOENT if not.

> +	hwc->idx = -1;
> +	hwc->config = event->attr.config;
> +
> +	if (!is_sampling_event(event)) {

As above, you can't support sampling events anyway, so you should reject 
them with -EINVAL.

You also need to validate event groups to ensure they don't contain more 
events than could ever be scheduled at once.

> +		/*
> +		 * For non-sampling runs, limit the sample_period to half
> +		 * of the counter width. That way, the new counter value
> +		 * is far less likely to overtake the previous one unless
> +		 * you have some serious IRQ latency issues.
> +		 */
> +		hwc->sample_period = pmu->mask_counter >> 1;
> +		hwc->last_period = hwc->sample_period;
> +		local64_set(&hwc->period_left, hwc->sample_period);
> +	}
> +
> +	return 0;
> +}
> +
> +static void riscv_iommu_pmu_update(struct perf_event *event)
> +{
> +	struct hw_perf_event *hwc = &event->hw;
> +	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +	u64 delta, prev, now;
> +	u32 idx = hwc->idx;
> +
> +	do {
> +		prev = local64_read(&hwc->prev_count);
> +		now = riscv_iommu_pmu_get_counter(pmu, idx);
> +	} while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
> +
> +	delta = FIELD_GET(RISCV_IOMMU_IOHPMCTR_COUNTER, now - prev) & pmu->mask_counter;
> +	local64_add(delta, &event->count);
> +	local64_sub(delta, &hwc->period_left);
> +}
> +
> +static void riscv_iommu_pmu_start(struct perf_event *event, int flags)
> +{
> +	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +	struct hw_perf_event *hwc = &event->hw;
> +
> +	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
> +		return;
> +
> +	if (flags & PERF_EF_RELOAD)
> +		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
> +
> +	hwc->state = 0;
> +	riscv_iommu_pmu_set_period(event);
> +	riscv_iommu_pmu_set_event(pmu, hwc->idx, hwc->config);
> +	riscv_iommu_pmu_enable_ovf_intr(pmu, hwc->idx);

This does nothing, since set_event has just implicitly written OF=0.

...unless, that is, the user was mischievous and also set bit 63 in 
event->attr.config, since you never sanitised the input ;)

> +	riscv_iommu_pmu_enable_counter(pmu, hwc->idx);
> +
> +	perf_event_update_userpage(event);
> +}
> +
> +static void riscv_iommu_pmu_stop(struct perf_event *event, int flags)
> +{
> +	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +	struct hw_perf_event *hwc = &event->hw;
> +
> +	if (hwc->state & PERF_HES_STOPPED)
> +		return;
> +
> +	riscv_iommu_pmu_set_event(pmu, hwc->idx, RISCV_IOMMU_HPMEVENT_INVALID);
> +	riscv_iommu_pmu_disable_counter(pmu, hwc->idx);
> +
> +	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE))
> +		riscv_iommu_pmu_update(event);
> +
> +	hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
> +}
> +
> +static int riscv_iommu_pmu_add(struct perf_event *event, int flags)
> +{
> +	struct hw_perf_event *hwc = &event->hw;
> +	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +	unsigned int num_counters = pmu->num_counters;
> +	int idx;
> +
> +	/* Reserve index zero for iohpmcycles */
> +	if (get_event(event) == RISCV_IOMMU_HPMEVENT_CYCLE)
> +		idx = 0;
> +	else
> +		idx = find_next_zero_bit(pmu->used_counters, num_counters, 1);
> +
> +	if (idx == num_counters)
> +		return -EAGAIN;

But stomping a cycles event on top of an existing cycles event is fine?

> +	set_bit(idx, pmu->used_counters);
> +
> +	pmu->events[idx] = event;
> +	hwc->idx = idx;
> +	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
> +
> +	if (flags & PERF_EF_START)
> +		riscv_iommu_pmu_start(event, flags);
> +
> +	/* Propagate changes to the userspace mapping. */
> +	perf_event_update_userpage(event);
> +
> +	return 0;
> +}
> +
> +static void riscv_iommu_pmu_read(struct perf_event *event)
> +{
> +	riscv_iommu_pmu_update(event);
> +}
> +
> +static void riscv_iommu_pmu_del(struct perf_event *event, int flags)
> +{
> +	struct hw_perf_event *hwc = &event->hw;
> +	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +	int idx = hwc->idx;
> +
> +	riscv_iommu_pmu_stop(event, PERF_EF_UPDATE);
> +	pmu->events[idx] = NULL;
> +	clear_bit(idx, pmu->used_counters);
> +	perf_event_update_userpage(event);
> +}
> +
> +irqreturn_t riscv_iommu_pmu_handle_irq(struct riscv_iommu_pmu *pmu)
> +{
> +	struct perf_sample_data data;
> +	struct pt_regs *regs;
> +	u32 ovf = readl(pmu->reg + RISCV_IOMMU_REG_IOCOUNTOVF);
> +	int idx;
> +
> +	if (!ovf)
> +		return IRQ_NONE;
> +
> +	riscv_iommu_pmu_stop_all(pmu);
> +
> +	regs = get_irq_regs();

What do these regs represent? If you look at the perf_event_open ABI, 
you'll see that events can target various combinations of CPU and/or 
pid, but there is no encoding for "whichever CPU takes the IOMMU PMU 
interrupt". Thus whatever you get here is more than likely not what the 
user asked for, and this is why non-CPU PMUs cannot reasonably support 
sampling ;)

> +
> +	for_each_set_bit(idx, (unsigned long *)&ovf, pmu->num_counters) {
> +		struct perf_event *event = pmu->events[idx];
> +		struct hw_perf_event *hwc;
> +
> +		if (WARN_ON_ONCE(!event) || !is_sampling_event(event))

You still need to handle counter rollover for all events, otherwise they 
can start losing counts and rapidly turn to nonsense. Admittedly it's 
largely theoretical with full 64-bit counters, but still...

> +			continue;
> +
> +		hwc = &event->hw;
> +
> +		riscv_iommu_pmu_update(event);
> +		perf_sample_data_init(&data, 0, hwc->last_period);
> +		if (!riscv_iommu_pmu_set_period(event))
> +			continue;
> +
> +		if (perf_event_overflow(event, &data, regs))
> +			riscv_iommu_pmu_stop(event, 0);
> +	}
> +
> +	riscv_iommu_pmu_start_all(pmu);
> +
> +	return IRQ_HANDLED;
> +}
> +
> +int riscv_iommu_pmu_init(struct riscv_iommu_pmu *pmu, void __iomem *reg,
> +			 const char *dev_name)
> +{
> +	char *name;
> +	int ret;
> +
> +	pmu->reg = reg;
> +	pmu->num_counters = RISCV_IOMMU_HPM_COUNTER_NUM;
> +	pmu->mask_counter = RISCV_IOMMU_IOHPMCTR_COUNTER;

Initially this looks weird - why bother storing constants in memory if 
they're constant? - but I see the spec implies they are not necessarily 
fixed, and we can only actually assume at least one counter at least 32 
bits wide, so I guess this is really more of a placeholder? Is there a 
well-defined way we're supposed to be able to discover these, like some 
more ID register fields somewhere, or writing all 1s to various 
registers to see what sticks, or is it liable to be a mess of just 
having to know what each implementation has by matching DT compatibles 
and/or PCI IDs?

> +
> +	pmu->pmu = (struct pmu) {
> +		.task_ctx_nr	= perf_invalid_context,
> +		.event_init	= riscv_iommu_pmu_event_init,
> +		.add		= riscv_iommu_pmu_add,
> +		.del		= riscv_iommu_pmu_del,
> +		.start		= riscv_iommu_pmu_start,
> +		.stop		= riscv_iommu_pmu_stop,
> +		.read		= riscv_iommu_pmu_read,
> +		.attr_groups	= riscv_iommu_pmu_attr_grps,
> +		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
> +		.module		= THIS_MODULE,
> +	};

The new thing is that you can now set pmu.parent to the IOMMU device so 
their relationship is clear in sysfs, rather than having to play tricks 
with the PMU name.

> +
> +	name = kasprintf(GFP_KERNEL, "riscv_iommu_pmu_%s", dev_name);
> +
> +	ret = perf_pmu_register(&pmu->pmu, name, -1);
> +	if (ret) {
> +		pr_err("Failed to register riscv_iommu_pmu_%s: %d\n",
> +		       dev_name, ret);
> +		return ret;
> +	}
> +
> +	/* Stop all counters and later start the counter with perf */
> +	riscv_iommu_pmu_stop_all(pmu);
> +
> +	pr_info("riscv_iommu_pmu_%s: Registered with %d counters\n",
> +		dev_name, pmu->num_counters);
> +
> +	return 0;
> +}
> +
> +void riscv_iommu_pmu_uninit(struct riscv_iommu_pmu *pmu)
> +{
> +	int idx;
> +
> +	/* Disable interrupt and functions */
> +	for_each_set_bit(idx, pmu->used_counters, pmu->num_counters) {

This will never do anything, since even if we could ever get here, it 
would not be at a time when there are any active events. Unregistering 
an in-use PMU does not end well (hence why standalone PMU drivers need 
to use suppress_bind_attrs)...

Thanks,
Robin.

> +		riscv_iommu_pmu_disable_counter(pmu, idx);
> +		riscv_iommu_pmu_disable_ovf_intr(pmu, idx);
> +	}
> +
> +	perf_pmu_unregister(&pmu->pmu);
> +}
> diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
> index b1c4664542b4..92659a8a75ae 100644
> --- a/drivers/iommu/riscv/iommu.h
> +++ b/drivers/iommu/riscv/iommu.h
> @@ -60,11 +60,19 @@ struct riscv_iommu_device {
>   	unsigned int ddt_mode;
>   	dma_addr_t ddt_phys;
>   	u64 *ddt_root;
> +
> +	/* hardware performance monitor */
> +	struct riscv_iommu_pmu pmu;
>   };
>   
>   int riscv_iommu_init(struct riscv_iommu_device *iommu);
>   void riscv_iommu_remove(struct riscv_iommu_device *iommu);
>   
> +int riscv_iommu_pmu_init(struct riscv_iommu_pmu *pmu, void __iomem *reg,
> +			 const char *name);
> +void riscv_iommu_pmu_uninit(struct riscv_iommu_pmu *pmu);
> +irqreturn_t riscv_iommu_pmu_handle_irq(struct riscv_iommu_pmu *pmu);
> +
>   #define riscv_iommu_readl(iommu, addr) \
>   	readl_relaxed((iommu)->reg + (addr))
>   


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ