[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <63905ff2-ee69-39de-507f-b02ebad39444@linux.intel.com>
Date: Fri, 13 Jan 2023 20:58:39 +0800
From: Baolu Lu <baolu.lu@...ux.intel.com>
To: kan.liang@...ux.intel.com, joro@...tes.org, will@...nel.org,
dwmw2@...radead.org, robin.murphy@....com, robert.moore@...el.com,
rafael.j.wysocki@...el.com, lenb@...nel.org, iommu@...ts.linux.dev,
linux-kernel@...r.kernel.org
Cc: baolu.lu@...ux.intel.com
Subject: Re: [PATCH 2/7] iommu/vt-d: Retrieve IOMMU perfmon capability
information
On 2023/1/12 4:24, kan.liang@...ux.intel.com wrote:
> From: Kan Liang <kan.liang@...ux.intel.com>
>
> The performance monitoring infrastructure, perfmon, is to support
> collection of information about key events occurring during operation of
> the remapping hardware, to aid performance tuning and debug. Each
> remapping hardware unit has capability registers that indicate support
> for performance monitoring features and enumerate the capabilities.
>
> Add alloc_iommu_pmu() to retrieve IOMMU perfmon capability information
> for each iommu unit. The information is stored in the iommu->pmu data
> structure. Capability registers are read-only, so it's safe to prefetch
> and store them in the pmu structure. This could avoid unnecessary VMEXIT
> when this code is running in the virtualization environment.
>
> Add free_iommu_pmu() to free the saved capability information when
> freeing the iommu unit.
>
> Add a kernel config option for the IOMMU perfmon feature. Unless a user
> explicitly uses the perf tool to monitor the IOMMU perfmon event, there
> isn't any impact for the existing IOMMU. Enable it by default.
>
> Signed-off-by: Kan Liang <kan.liang@...ux.intel.com>
> ---
> drivers/iommu/intel/Kconfig | 9 ++
> drivers/iommu/intel/Makefile | 1 +
> drivers/iommu/intel/dmar.c | 7 ++
> drivers/iommu/intel/iommu.h | 41 +++++++++
> drivers/iommu/intel/perfmon.c | 159 ++++++++++++++++++++++++++++++++++
> drivers/iommu/intel/perfmon.h | 41 +++++++++
> 6 files changed, 258 insertions(+)
> create mode 100644 drivers/iommu/intel/perfmon.c
> create mode 100644 drivers/iommu/intel/perfmon.h
>
> diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
> index b7dff5092fd2..1a4aebddc9a6 100644
> --- a/drivers/iommu/intel/Kconfig
> +++ b/drivers/iommu/intel/Kconfig
> @@ -96,4 +96,13 @@ config INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
> passing intel_iommu=sm_on to the kernel. If not sure, please use
> the default value.
>
> +config INTEL_IOMMU_PERF_EVENTS
> + def_bool y
> + bool "Intel IOMMU performance events"
> + depends on INTEL_IOMMU && PERF_EVENTS
> + help
> + Include support for Intel IOMMU performance events. These are
> + available on modern processors which support Intel VT-d 4.0 and
> + later.
> +
> endif # INTEL_IOMMU
> diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile
> index fa0dae16441c..7af3b8a4f2a0 100644
> --- a/drivers/iommu/intel/Makefile
> +++ b/drivers/iommu/intel/Makefile
> @@ -6,3 +6,4 @@ obj-$(CONFIG_DMAR_PERF) += perf.o
> obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o
> obj-$(CONFIG_INTEL_IOMMU_SVM) += svm.o
> obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o
> +obj-$(CONFIG_INTEL_IOMMU_PERF_EVENTS) += perfmon.o
> diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
> index 6a411d964474..91bb48267df2 100644
> --- a/drivers/iommu/intel/dmar.c
> +++ b/drivers/iommu/intel/dmar.c
> @@ -34,6 +34,7 @@
> #include "../irq_remapping.h"
> #include "perf.h"
> #include "trace.h"
> +#include "perfmon.h"
>
> typedef int (*dmar_res_handler_t)(struct acpi_dmar_header *, void *);
> struct dmar_res_callback {
> @@ -1114,6 +1115,9 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
> if (sts & DMA_GSTS_QIES)
> iommu->gcmd |= DMA_GCMD_QIE;
>
> + if (alloc_iommu_pmu(iommu))
> + pr_debug("Cannot alloc PMU for iommu (seq_id = %d)\n", iommu->seq_id);
> +
> raw_spin_lock_init(&iommu->register_lock);
>
> /*
> @@ -1148,6 +1152,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
> err_sysfs:
> iommu_device_sysfs_remove(&iommu->iommu);
> err_unmap:
> + free_iommu_pmu(iommu);
> unmap_iommu(iommu);
> error_free_seq_id:
> ida_free(&dmar_seq_ids, iommu->seq_id);
> @@ -1163,6 +1168,8 @@ static void free_iommu(struct intel_iommu *iommu)
> iommu_device_sysfs_remove(&iommu->iommu);
> }
>
> + free_iommu_pmu(iommu);
> +
> if (iommu->irq) {
> if (iommu->pr_irq) {
> free_irq(iommu->pr_irq, iommu);
> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
> index 06e61e474856..5bcefbea55c9 100644
> --- a/drivers/iommu/intel/iommu.h
> +++ b/drivers/iommu/intel/iommu.h
> @@ -125,6 +125,11 @@
> #define DMAR_MTRR_PHYSMASK8_REG 0x208
> #define DMAR_MTRR_PHYSBASE9_REG 0x210
> #define DMAR_MTRR_PHYSMASK9_REG 0x218
> +#define DMAR_PERFCAP_REG 0x300
> +#define DMAR_PERFCFGOFF_REG 0x310
> +#define DMAR_PERFOVFOFF_REG 0x318
> +#define DMAR_PERFCNTROFF_REG 0x31c
> +#define DMAR_PERFEVNTCAP_REG 0x380
> #define DMAR_VCCAP_REG 0xe30 /* Virtual command capability register */
> #define DMAR_VCMD_REG 0xe00 /* Virtual command register */
> #define DMAR_VCRSP_REG 0xe10 /* Virtual command response register */
> @@ -148,6 +153,7 @@
> */
> #define cap_esrtps(c) (((c) >> 63) & 1)
> #define cap_esirtps(c) (((c) >> 62) & 1)
> +#define cap_ecmds(c) (((c) >> 61) & 1)
> #define cap_fl5lp_support(c) (((c) >> 60) & 1)
> #define cap_pi_support(c) (((c) >> 59) & 1)
> #define cap_fl1gp_support(c) (((c) >> 56) & 1)
> @@ -179,6 +185,7 @@
> * Extended Capability Register
> */
>
> +#define ecap_pms(e) (((e) >> 51) & 0x1)
> #define ecap_rps(e) (((e) >> 49) & 0x1)
> #define ecap_smpwc(e) (((e) >> 48) & 0x1)
> #define ecap_flts(e) (((e) >> 47) & 0x1)
> @@ -210,6 +217,22 @@
> #define ecap_max_handle_mask(e) (((e) >> 20) & 0xf)
> #define ecap_sc_support(e) (((e) >> 7) & 0x1) /* Snooping Control */
>
> +/*
> + * Decoding Perf Capability Register
> + */
> +#define pcap_num_cntr(p) ((p) & 0xffff)
> +#define pcap_cntr_width(p) (((p) >> 16) & 0x7f)
> +#define pcap_num_event_group(p) (((p) >> 24) & 0x1f)
> +#define pcap_filters_mask(p) (((p) >> 32) & 0x1f)
> +#define pcap_interrupt(p) (((p) >> 50) & 0x1)
> +/* The counter stride is calculated as 2 ^ (x+10) bytes */
> +#define pcap_cntr_stride(p) (1ULL << ((((p) >> 52) & 0x7) + 10))
> +
> +/*
> + * Decoding Perf Event Capability Register
> + */
> +#define pecap_es(p) ((p) & 0xfffffff)
> +
> /* Virtual command interface capability */
> #define vccap_pasid(v) (((v) & DMA_VCS_PAS)) /* PASID allocation */
>
> @@ -554,6 +577,22 @@ struct dmar_domain {
> iommu core */
> };
>
> +struct iommu_pmu {
> + struct intel_iommu *iommu;
> + u32 num_cntr; /* Number of counters */
> + u32 num_eg; /* Number of event group */
> + u32 cntr_width; /* Counter width */
> + u32 cntr_stride; /* Counter Stride */
> + u32 filter; /* Bitmask of filter support */
> + void __iomem *base; /* the PerfMon base address */
> + void __iomem *cfg_reg; /* counter configuration base address */
> + void __iomem *cntr_reg; /* counter 0 address*/
> + void __iomem *overflow; /* overflow status register */
> +
> + u64 *evcap; /* Indicates all supported events */
> + u32 **cntr_evcap; /* Supported events of each counter. */
> +};
> +
> struct intel_iommu {
> void __iomem *reg; /* Pointer to hardware regs, virtual addr */
> u64 reg_phys; /* physical address of hw register set */
> @@ -600,6 +639,8 @@ struct intel_iommu {
>
> struct dmar_drhd_unit *drhd;
> void *perf_statistic;
> +
> + struct iommu_pmu *pmu;
> };
>
> /* PCI domain-device relationship */
> diff --git a/drivers/iommu/intel/perfmon.c b/drivers/iommu/intel/perfmon.c
> new file mode 100644
> index 000000000000..bc090f329c32
> --- /dev/null
> +++ b/drivers/iommu/intel/perfmon.c
> @@ -0,0 +1,159 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Support Intel IOMMU PerfMon
> + * Copyright(c) 2022 Intel Corporation.
Nit: the copyright year should be 2023, not 2022.
> + */
> +
> +#include <linux/dmar.h>
> +#include "iommu.h"
> +#include "perfmon.h"
> +
> +static inline void __iomem *
> +get_perf_reg_address(struct intel_iommu *iommu, u32 offset)
> +{
> + u32 off = dmar_readl(iommu->reg + offset);
> +
> + return iommu->reg + off;
> +}
> +
> +int alloc_iommu_pmu(struct intel_iommu *iommu)
> +{
> + struct iommu_pmu *iommu_pmu;
> + int i, j, ret;
> + u64 perfcap;
> + u32 cap;
> +
> + /* The IOMMU PMU requires the ECMD support as well */
> + if (!ecap_pms(iommu->ecap) || !cap_ecmds(iommu->cap))
> + return -ENODEV;
It's normal for an IOMMU not to support PMS, so that case should not be
treated as an error. How about splitting the check like this:

	if (!ecap_pms(iommu->ecap))
		return 0;

	/* The IOMMU PMU requires the ECMD support as well */
	if (!cap_ecmds(iommu->cap))
		return -ENODEV;
> +
> + perfcap = dmar_readq(iommu->reg + DMAR_PERFCAP_REG);
> + /* The performance monitoring is not supported. */
> + if (!perfcap)
> + return -ENODEV;
> +
> + /* Sanity check for the number of the counters and event groups */
> + if (!pcap_num_cntr(perfcap) || !pcap_num_event_group(perfcap))
> + return -ENODEV;
> +
> + /* The interrupt on overflow is required */
> + if (!pcap_interrupt(perfcap))
> + return -ENODEV;
> +
> + iommu_pmu = kzalloc(sizeof(*iommu_pmu), GFP_KERNEL);
> + if (!iommu_pmu)
> + return -ENOMEM;
> +
> + iommu_pmu->num_cntr = pcap_num_cntr(perfcap);
> + iommu_pmu->cntr_width = pcap_cntr_width(perfcap);
> + iommu_pmu->filter = pcap_filters_mask(perfcap);
> + iommu_pmu->cntr_stride = pcap_cntr_stride(perfcap);
> + iommu_pmu->num_eg = pcap_num_event_group(perfcap);
> +
> + iommu_pmu->evcap = kcalloc(iommu_pmu->num_eg, sizeof(u64), GFP_KERNEL);
> + if (!iommu_pmu->evcap) {
> + ret = -ENOMEM;
> + goto free_pmu;
> + }
> +
> + /* Parse event group capabilities */
> + for (i = 0; i < iommu_pmu->num_eg; i++) {
> + u64 pcap;
> +
> + pcap = dmar_readq(iommu->reg + DMAR_PERFEVNTCAP_REG +
> + i * IOMMU_PMU_CAP_REGS_STEP);
> + iommu_pmu->evcap[i] = pecap_es(pcap);
> + }
> +
> + iommu_pmu->cntr_evcap = kcalloc(iommu_pmu->num_cntr, sizeof(u32 *), GFP_KERNEL);
> + if (!iommu_pmu->cntr_evcap) {
> + ret = -ENOMEM;
> + goto free_pmu_evcap;
> + }
> + for (i = 0; i < iommu_pmu->num_cntr; i++) {
> + iommu_pmu->cntr_evcap[i] = kcalloc(iommu_pmu->num_eg, sizeof(u32), GFP_KERNEL);
> + if (!iommu_pmu->cntr_evcap[i]) {
> + ret = -ENOMEM;
> + iommu_pmu->num_cntr = i;
Do we really need the above line? kfree() handles a NULL pointer
gracefully, so truncating num_cntr before jumping to the error path
seems unnecessary, right?
> + goto free_pmu_cntr_evcap;
> + }
> + /*
> + * Set to the global capabilities, will adjust according
> + * to per-counter capabilities later.
> + */
> + for (j = 0; j < iommu_pmu->num_eg; j++)
> + iommu_pmu->cntr_evcap[i][j] = (u32)iommu_pmu->evcap[j];
> + }
> +
> + iommu_pmu->cfg_reg = get_perf_reg_address(iommu, DMAR_PERFCFGOFF_REG);
> + iommu_pmu->cntr_reg = get_perf_reg_address(iommu, DMAR_PERFCNTROFF_REG);
> + iommu_pmu->overflow = get_perf_reg_address(iommu, DMAR_PERFOVFOFF_REG);
> +
> + /*
> + * Check per-counter capabilities
> + * All counters should have the same capabilities on
> + * Interrupt on Overflow Support and Counter Width
> + */
Please reorganize this comment so that it reads more clearly (e.g. end
each sentence with proper punctuation and tighten the wording).
> + for (i = 0; i < iommu_pmu->num_cntr; i++) {
> + cap = dmar_readl(iommu_pmu->cfg_reg +
> + i * IOMMU_PMU_CFG_OFFSET +
> + IOMMU_PMU_CFG_CNTRCAP_OFFSET);
> + if (!iommu_cntrcap_pcc(cap))
> + continue;
> + if ((iommu_cntrcap_cw(cap) != iommu_pmu->cntr_width) ||
> + !iommu_cntrcap_ios(cap))
> + iommu_pmu->num_cntr = i;
Can you please add a comment describing how the capability-inconsistency
case is handled? It seems that you simply truncate the number of
counters when one has a mismatched width or lacks interrupt-on-overflow
support — what is the rationale behind that?
> +
> + /* Clear the pre-defined events group */
> + for (j = 0; j < iommu_pmu->num_eg; j++)
> + iommu_pmu->cntr_evcap[i][j] = 0;
> +
> + /* Override with per-counter event capabilities */
> + for (j = 0; j < iommu_cntrcap_egcnt(cap); j++) {
> + cap = dmar_readl(iommu_pmu->cfg_reg + i * IOMMU_PMU_CFG_OFFSET +
> + IOMMU_PMU_CFG_CNTREVCAP_OFFSET +
> + (j * IOMMU_PMU_OFF_REGS_STEP));
> + iommu_pmu->cntr_evcap[i][iommu_event_group(cap)] = iommu_event_select(cap);
> + /*
> + * Some events may only be supported by a specific counter.
> + * Track them in the evcap as well.
> + */
> + iommu_pmu->evcap[iommu_event_group(cap)] |= iommu_event_select(cap);
> + }
> + }
> +
> + iommu_pmu->iommu = iommu;
> + iommu->pmu = iommu_pmu;
> +
> + return 0;
> +
> +free_pmu_cntr_evcap:
> + for (i = 0; i < iommu_pmu->num_cntr; i++)
> + kfree(iommu_pmu->cntr_evcap[i]);
> + kfree(iommu_pmu->cntr_evcap);
> +free_pmu_evcap:
> + kfree(iommu_pmu->evcap);
> +free_pmu:
> + kfree(iommu_pmu);
> +
> + return ret;
> +}
> +
> +void free_iommu_pmu(struct intel_iommu *iommu)
> +{
> + struct iommu_pmu *iommu_pmu = iommu->pmu;
> +
> + if (!iommu_pmu)
> + return;
> +
> + if (iommu_pmu->evcap) {
> + int i;
> +
> + for (i = 0; i < iommu_pmu->num_cntr; i++)
> + kfree(iommu_pmu->cntr_evcap[i]);
> + kfree(iommu_pmu->cntr_evcap);
> + }
> + kfree(iommu_pmu->evcap);
> + kfree(iommu_pmu);
> + iommu->pmu = NULL;
> +}
> diff --git a/drivers/iommu/intel/perfmon.h b/drivers/iommu/intel/perfmon.h
> new file mode 100644
> index 000000000000..8587c80501cd
> --- /dev/null
> +++ b/drivers/iommu/intel/perfmon.h
> @@ -0,0 +1,41 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +/*
> + * PERFCFGOFF_REG, PERFFRZOFF_REG
> + * PERFOVFOFF_REG, PERFCNTROFF_REG
> + */
> +#define IOMMU_PMU_NUM_OFF_REGS 4
> +#define IOMMU_PMU_OFF_REGS_STEP 4
> +
> +#define IOMMU_PMU_CFG_OFFSET 0x100
> +#define IOMMU_PMU_CFG_CNTRCAP_OFFSET 0x80
> +#define IOMMU_PMU_CFG_CNTREVCAP_OFFSET 0x84
> +#define IOMMU_PMU_CFG_SIZE 0x8
> +#define IOMMU_PMU_CFG_FILTERS_OFFSET 0x4
> +
> +
> +#define IOMMU_PMU_CAP_REGS_STEP 8
> +
> +#define iommu_cntrcap_pcc(p) ((p) & 0x1)
> +#define iommu_cntrcap_cw(p) ((p >> 8) & 0xff)
> +#define iommu_cntrcap_ios(p) ((p >> 16) & 0x1)
> +#define iommu_cntrcap_egcnt(p) ((p >> 28) & 0xf)
> +
> +#define iommu_event_select(p) ((p) & 0xfffffff)
> +#define iommu_event_group(p) ((p >> 28) & 0xf)
> +
> +#ifdef CONFIG_INTEL_IOMMU_PERF_EVENTS
> +int alloc_iommu_pmu(struct intel_iommu *iommu);
> +void free_iommu_pmu(struct intel_iommu *iommu);
> +#else
> +static inline int
> +alloc_iommu_pmu(struct intel_iommu *iommu)
> +{
> + return 0;
> +}
> +
> +static inline void
> +free_iommu_pmu(struct intel_iommu *iommu)
> +{
> +}
> +#endif /* CONFIG_INTEL_IOMMU_PERF_EVENTS */
--
Best regards,
baolu
Powered by blists - more mailing lists