[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <c30b50066aa0910538bf3cacd046d9c58984fb60.camel@redhat.com>
Date: Mon, 03 Mar 2025 08:49:08 +0100
From: Philipp Stanner <pstanner@...hat.com>
To: Wathsala Vithanage <wathsala.vithanage@....com>,
linux-kernel@...r.kernel.org
Cc: nd@....com, Alex Williamson <alex.williamson@...hat.com>, Jason
Gunthorpe <jgg@...pe.ca>, Kevin Tian <kevin.tian@...el.com>, Yunxiang Li
<Yunxiang.Li@....com>, "Dr. David Alan Gilbert" <linux@...blig.org>, Ankit
Agrawal <ankita@...dia.com>, "open list:VFIO DRIVER" <kvm@...r.kernel.org>
Subject: Re: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
On Fri, 2025-02-21 at 22:46 +0000, Wathsala Vithanage wrote:
> Linux v6.13 introduced the PCIe TLP Processing Hints (TPH) feature
> for
> direct cache injection. As described in the relevant patch set [1],
> direct cache injection in supported hardware allows optimal platform
> resource utilization for specific requests on the PCIe bus. This
> feature
> is currently available only for kernel device drivers. However,
> user space applications, especially those whose performance is
> sensitive
> to the latency of inbound writes as seen by a CPU core, may benefit
> from
> using this information (E.g., DPDK cache stashing RFC [2] or an HPC
> application running in a VM).
>
> This patch enables configuring of TPH from the user space via
> VFIO_DEVICE_FEATURE IOCLT. It provides an interface to user space
> drivers and VMMs to enable/disable the TPH feature on PCIe devices
> and
> set steering tags in MSI-X or steering-tag table entries using
> VFIO_DEVICE_FEATURE_SET flag or read steering tags from the kernel
> using
> VFIO_DEVICE_FEATURE_GET to operate in device-specific mode.
>
> [1]
> lore.kernel.org/linux-pci/20241002165954.128085-1-wei.huang2@....com
> [2]
> inbox.dpdk.org/dev/20241021015246.304431-2-wathsala.vithanage@....com
>
> Signed-off-by: Wathsala Vithanage <wathsala.vithanage@....com>
> ---
> drivers/vfio/pci/vfio_pci_core.c | 163
> +++++++++++++++++++++++++++++++
> include/uapi/linux/vfio.h | 68 +++++++++++++
> 2 files changed, 231 insertions(+)
>
> diff --git a/drivers/vfio/pci/vfio_pci_core.c
> b/drivers/vfio/pci/vfio_pci_core.c
> index 586e49efb81b..d6dd0495b08b 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -29,6 +29,7 @@
> #include <linux/nospec.h>
> #include <linux/sched/mm.h>
> #include <linux/iommufd.h>
> +#include <linux/pci-tph.h>
> #if IS_ENABLED(CONFIG_EEH)
> #include <asm/eeh.h>
> #endif
> @@ -1510,6 +1511,165 @@ static int vfio_pci_core_feature_token(struct
> vfio_device *device, u32 flags,
> return 0;
> }
>
> +static ssize_t vfio_pci_tph_uinfo_dup(struct vfio_pci_tph *tph,
> + void __user *arg, size_t
> argsz,
> + struct vfio_pci_tph_info
> **info)
> +{
> + size_t minsz;
> +
> + if (tph->count > VFIO_TPH_INFO_MAX)
> + return -EINVAL;
> + if (!tph->count)
> + return 0;
> +
> + minsz = tph->count * sizeof(struct vfio_pci_tph_info);
> + if (minsz < argsz)
> + return -EINVAL;
> +
> + *info = memdup_user(arg, minsz);
You can use memdup_array_user() instead of the lines above. It does the
multiplication plus overflow check for you and will make your code more
compact.
> + if (IS_ERR(info))
> + return PTR_ERR(info);
> +
> + return minsz;
see below…
> +}
> +
> +static int vfio_pci_feature_tph_st_op(struct vfio_pci_core_device
> *vdev,
> + struct vfio_pci_tph *tph,
> + void __user *arg, size_t
> argsz)
> +{
> + int i, mtype, err = 0;
> + u32 cpu_uid;
> + struct vfio_pci_tph_info *info = NULL;
> + ssize_t data_size = vfio_pci_tph_uinfo_dup(tph, arg, argsz,
> &info);
> +
> + if (data_size <= 0)
> + return data_size;
So it seems you return here in case of an error. However, that would
result in a length of 0 being an error?
I would try to avoid to return 0 for an error whenever possible. That
breaks convention.
How about you return the result value of memdup_array_user() in
…uinfo_dup()?
The only thing I can't tell is whether tph->count == 0 should be
treated as an error. Maybe map it to -EINVAL?
Regards,
P.
> +
> + for (i = 0; i < tph->count; i++) {
> + if (!(info[i].cpu_id < nr_cpu_ids &&
> cpu_present(info[i].cpu_id))) {
> + info[i].err = -EINVAL;
> + continue;
> + }
> + cpu_uid = topology_core_id(info[i].cpu_id);
> + mtype = (info[i].flags & VFIO_TPH_MEM_TYPE_MASK) >>
> + VFIO_TPH_MEM_TYPE_SHIFT;
> +
> + /* processing hints are always ignored */
> + info[i].ph_ignore = 1;
> +
> + info[i].err = pcie_tph_get_cpu_st(vdev->pdev, mtype,
> cpu_uid,
> + &info[i].st);
> + if (info[i].err)
> + continue;
> +
> + if (tph->flags & VFIO_DEVICE_FEATURE_TPH_SET_ST) {
> + info[i].err = pcie_tph_set_st_entry(vdev-
> >pdev,
> +
> info[i].index,
> +
> info[i].st);
> + }
> + }
> +
> + if (copy_to_user(arg, info, data_size))
> + err = -EFAULT;
> +
> + kfree(info);
> + return err;
> +}
> +
> +
> +static int vfio_pci_feature_tph_enable(struct vfio_pci_core_device
> *vdev,
> + struct vfio_pci_tph *arg)
> +{
> + int mode = arg->flags & VFIO_TPH_ST_MODE_MASK;
> +
> + switch (mode) {
> + case VFIO_TPH_ST_NS_MODE:
> + return pcie_enable_tph(vdev->pdev,
> PCI_TPH_ST_NS_MODE);
> +
> + case VFIO_TPH_ST_IV_MODE:
> + return pcie_enable_tph(vdev->pdev,
> PCI_TPH_ST_IV_MODE);
> +
> + case VFIO_TPH_ST_DS_MODE:
> + return pcie_enable_tph(vdev->pdev,
> PCI_TPH_ST_DS_MODE);
> +
> + default:
> + return -EINVAL;
> + }
> +
> +}
> +
> +static int vfio_pci_feature_tph_disable(struct vfio_pci_core_device
> *vdev)
> +{
> + pcie_disable_tph(vdev->pdev);
> + return 0;
> +}
> +
> +static int vfio_pci_feature_tph_prepare(struct vfio_pci_tph __user
> *arg,
> + size_t argsz, u32 flags,
> + struct vfio_pci_tph *tph)
> +{
> + u32 op;
> + int err = vfio_check_feature(flags, argsz,
> + VFIO_DEVICE_FEATURE_SET |
> + VFIO_DEVICE_FEATURE_GET,
> + sizeof(struct vfio_pci_tph));
> + if (err != 1)
> + return err;
> +
> + if (copy_from_user(tph, arg, sizeof(struct vfio_pci_tph)))
> + return -EFAULT;
> +
> + op = tph->flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK;
> +
> + switch (op) {
> + case VFIO_DEVICE_FEATURE_TPH_ENABLE:
> + case VFIO_DEVICE_FEATURE_TPH_DISABLE:
> + case VFIO_DEVICE_FEATURE_TPH_SET_ST:
> + return (flags & VFIO_DEVICE_FEATURE_SET) ? 0 : -
> EINVAL;
> +
> + case VFIO_DEVICE_FEATURE_TPH_GET_ST:
> + return (flags & VFIO_DEVICE_FEATURE_GET) ? 0 : -
> EINVAL;
> +
> + default:
> + return -EINVAL;
> + }
> +}
> +
> +static int vfio_pci_core_feature_tph(struct vfio_device *device, u32
> flags,
> + struct vfio_pci_tph __user
> *arg,
> + size_t argsz)
> +{
> + u32 op;
> + struct vfio_pci_tph tph;
> + void __user *uinfo;
> + size_t infosz;
> + struct vfio_pci_core_device *vdev =
> + container_of(device, struct vfio_pci_core_device,
> vdev);
> + int err = vfio_pci_feature_tph_prepare(arg, argsz, flags,
> &tph);
> +
> + if (err)
> + return err;
> +
> + op = tph.flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK;
> +
> + switch (op) {
> + case VFIO_DEVICE_FEATURE_TPH_ENABLE:
> + return vfio_pci_feature_tph_enable(vdev, &tph);
> +
> + case VFIO_DEVICE_FEATURE_TPH_DISABLE:
> + return vfio_pci_feature_tph_disable(vdev);
> +
> + case VFIO_DEVICE_FEATURE_TPH_GET_ST:
> + case VFIO_DEVICE_FEATURE_TPH_SET_ST:
> + uinfo = (u8 *)(arg) + offsetof(struct vfio_pci_tph,
> info);
> + infosz = argsz - sizeof(struct vfio_pci_tph);
> + return vfio_pci_feature_tph_st_op(vdev, &tph, uinfo,
> infosz);
> +
> + default:
> + return -EINVAL;
> + }
> +}
> +
> int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32
> flags,
> void __user *arg, size_t argsz)
> {
> @@ -1523,6 +1683,9 @@ int vfio_pci_core_ioctl_feature(struct
> vfio_device *device, u32 flags,
> return vfio_pci_core_pm_exit(device, flags, arg,
> argsz);
> case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
> return vfio_pci_core_feature_token(device, flags,
> arg, argsz);
> + case VFIO_DEVICE_FEATURE_PCI_TPH:
> + return vfio_pci_core_feature_tph(device, flags,
> + arg, argsz);
> default:
> return -ENOTTY;
> }
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index c8dbf8219c4f..608d57dfe279 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1458,6 +1458,74 @@ struct vfio_device_feature_bus_master {
> };
> #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
>
> +/*
> + * Upon VFIO_DEVICE_FEATURE_SET, enable or disable PCIe TPH or set
> steering tags
> + * on the device. Data provided when setting this feature is a __u32
> with the
> + * following flags. VFIO_DEVICE_FEATURE_TPH_ENABLE enables PCIe TPH
> in
> + * no-steering-tag, interrupt-vector, or device-specific mode when
> feature flags
> + * VFIO_TPH_ST_NS_MODE, VFIO_TPH_ST_IV_MODE, and VFIO_TPH_ST_DS_MODE
> are set
> + * respectively.
> + * VFIO_DEVICE_FEATURE_TPH_DISABLE disables PCIe TPH on the device.
> + * VFIO_DEVICE_FEATURE_TPH_SET_ST set steering tags on a device at
> an index in
> + * MSI-X or ST-table depending on the VFIO_TPH_ST_x_MODE flag used
> and device
> + * capabilities. The caller can set multiple steering tags by
> passing an array
> + * of vfio_pci_tph_info objects containing cpu_id, cache_level, and
> + * MSI-X/ST-table index. The caller can also set the intended memory
> type and
> + * the processing hint by setting VFIO_TPH_MEM_TYPE_x and
> VFIO_TPH_HINT_x flags,
> + * respectively. The return value for each vfio_pci_tph_info object
> is stored in
> + * err, with the steering-tag set on the device and the ph_ignore
> status bit
> + * resulting from the steering-tag lookup operation. If err < 0, the
> values
> + * stored in the st and ph_ignore fields should be considered
> invalid.
> + *
> + * Upon VFIO_DEVICE_FEATURE_GET, return steering tags to the
> caller.
> + * VFIO_DEVICE_FEATURE_TPH_GET_ST returns steering tags to the
> caller.
> + * The return values per vfio_pci_tph_info object are stored in the
> st,
> + * ph_ignore, and err fields.
> + */
> +struct vfio_pci_tph_info {
> + /* in */
> + __u32 cpu_id;
> + __u32 cache_level;
> + __u8 flags;
> +#define VFIO_TPH_MEM_TYPE_MASK 0x1
> +#define VFIO_TPH_MEM_TYPE_SHIFT 0
> +#define VFIO_TPH_MEM_TYPE_VMEM 0 /* Request volatile
> memory ST */
> +#define VFIO_TPH_MEM_TYPE_PMEM 1 /* Request
> persistent memory ST */
> +
> +#define VFIO_TPH_HINT_MASK 0x3
> +#define VFIO_TPH_HINT_SHIFT 1
> +#define VFIO_TPH_HINT_BIDIR 0
> +#define VFIO_TPH_HINT_REQSTR (1 << VFIO_TPH_HINT_SHIFT)
> +#define VFIO_TPH_HINT_TARGET (2 << VFIO_TPH_HINT_SHIFT)
> +#define VFIO_TPH_HINT_TARGET_PRIO (3 << VFIO_TPH_HINT_SHIFT)
> + __u16 index; /* MSI-X/ST-table index to
> set ST */
> + /* out */
> + __u16 st; /* Steering-Tag */
> + __u8 ph_ignore; /* Processing hint was
> ignored by */
> + __s32 err; /* Error on getting/setting
> Steering-Tag*/
> +};
> +
> +struct vfio_pci_tph {
> + __u32 argsz; /* Size of vfio_pci_tph and
> info[] */
> + __u32 flags;
> +#define VFIO_DEVICE_FEATURE_TPH_OP_MASK 0x7
> +#define VFIO_DEVICE_FEATURE_TPH_OP_SHIFT 3
> +#define VFIO_DEVICE_FEATURE_TPH_ENABLE 0 /* Enable
> TPH on device */
> +#define VFIO_DEVICE_FEATURE_TPH_DISABLE 1 /* Disable
> TPH on device */
> +#define VFIO_DEVICE_FEATURE_TPH_GET_ST 2 /* Get
> steering-tags */
> +#define VFIO_DEVICE_FEATURE_TPH_SET_ST 4 /* Set
> steering-rags */
> +
> +#define VFIO_TPH_ST_MODE_MASK (0x3 <<
> VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> +#define VFIO_TPH_ST_NS_MODE (0 <<
> VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> +#define VFIO_TPH_ST_IV_MODE (1 <<
> VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> +#define VFIO_TPH_ST_DS_MODE (2 <<
> VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> + __u32 count; /* Number of entries
> in info[] */
> + struct vfio_pci_tph_info info[];
> +#define VFIO_TPH_INFO_MAX 64 /* Max entries
> allowed in info[] */
> +};
> +
> +#define VFIO_DEVICE_FEATURE_PCI_TPH 11
> +
> /* -------- API for Type1 VFIO IOMMU -------- */
>
> /**
Powered by blists - more mailing lists