lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20200402165954.48d941ee@w520.home>
Date:   Thu, 2 Apr 2020 16:59:54 -0600
From:   Alex Williamson <alex.williamson@...hat.com>
To:     "Liu, Yi L" <yi.l.liu@...el.com>
Cc:     eric.auger@...hat.com, kevin.tian@...el.com,
        jacob.jun.pan@...ux.intel.com, joro@...tes.org,
        ashok.raj@...el.com, jun.j.tian@...el.com, yi.y.sun@...el.com,
        jean-philippe@...aro.org, peterx@...hat.com,
        iommu@...ts.linux-foundation.org, kvm@...r.kernel.org,
        linux-kernel@...r.kernel.org, hao.wu@...el.com
Subject: Re: [PATCH v1 2/2] vfio/pci: Emulate PASID/PRI capability for VFs

On Sun, 22 Mar 2020 05:33:14 -0700
"Liu, Yi L" <yi.l.liu@...el.com> wrote:

> From: Liu Yi L <yi.l.liu@...el.com>
> 
> Per PCIe r5.0, sec 9.3.7.14, if a PF implements the PASID Capability, the
> PF PASID configuration is shared by its VFs.  VFs must not implement their
> own PASID Capability.
> 
> Per PCIe r5.0, sec 9.3.7.11, VFs must not implement the PRI Capability. If
> the PF implements PRI, it is shared by the VFs.
> 
> On bare metal, it has been fixed by below efforts.
> to PASID/PRI are
> https://lkml.org/lkml/2019/9/5/996
> https://lkml.org/lkml/2019/9/5/995
> 
> This patch adds emulated PASID/PRI capabilities for VFs when assigned to
> VMs via vfio-pci driver. This is required for enabling vSVA on pass-through
> VFs as VFs have no PASID/PRI capability structure in its configure space.
> 
> Cc: Kevin Tian <kevin.tian@...el.com>
> CC: Jacob Pan <jacob.jun.pan@...ux.intel.com>
> Cc: Alex Williamson <alex.williamson@...hat.com>
> Cc: Eric Auger <eric.auger@...hat.com>
> Cc: Jean-Philippe Brucker <jean-philippe@...aro.org>
> Signed-off-by: Liu Yi L <yi.l.liu@...el.com>
> ---
>  drivers/vfio/pci/vfio_pci_config.c | 325 ++++++++++++++++++++++++++++++++++++-
>  1 file changed, 323 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
> index 4b9af99..84b4ea0 100644
> --- a/drivers/vfio/pci/vfio_pci_config.c
> +++ b/drivers/vfio/pci/vfio_pci_config.c
> @@ -1509,11 +1509,304 @@ static int vfio_cap_init(struct vfio_pci_device *vdev)
>  	return 0;
>  }
>  
> +static int vfio_fill_custom_vconfig_bytes(struct vfio_pci_device *vdev,
> +					int offset, uint8_t *data, int size)
> +{
> +	int ret = 0, data_offset = 0;
> +
> +	while (size) {
> +		int filled;
> +
> +		if (size >= 4 && !(offset % 4)) {
> +			__le32 *dwordp = (__le32 *)&vdev->vconfig[offset];
> +			u32 dword;
> +
> +			memcpy(&dword, data + data_offset, 4);
> +			*dwordp = cpu_to_le32(dword);

Why wouldn't we do:

*dwordp = cpu_to_le32(*(u32 *)(data + data_offset));

or better yet, increment data on each iteration for:

*dwordp = cpu_to_le32(*(u32 *)data);

vfio_fill_vconfig_bytes() does almost this same thing, getting the data
from config space rather than a buffer, so please figure out how to
avoid duplicating the logic.

> +			filled = 4;
> +		} else if (size >= 2 && !(offset % 2)) {
> +			__le16 *wordp = (__le16 *)&vdev->vconfig[offset];
> +			u16 word;
> +
> +			memcpy(&word, data + data_offset, 2);
> +			*wordp = cpu_to_le16(word);
> +			filled = 2;
> +		} else {
> +			u8 *byte = &vdev->vconfig[offset];
> +
> +			memcpy(byte, data + data_offset, 1);

This one is really silly.

vdev->vconfig[offset] = *data;

> +			filled = 1;
> +		}
> +
> +		offset += filled;
> +		data_offset += filled;
> +		size -= filled;
> +	}
> +
> +	return ret;
> +}
> +
> +static int vfio_pci_get_ecap_content(struct pci_dev *pdev,
> +					int cap, int cap_len, u8 *content)
> +{
> +	int pos, offset, len = cap_len, ret = 0;
> +
> +	pos = pci_find_ext_capability(pdev, cap);
> +	if (!pos)
> +		return -EINVAL;
> +
> +	offset = 0;
> +	while (len) {
> +		int fetched;
> +
> +		if (len >= 4 && !(pos % 4)) {
> +			u32 *dwordp = (u32 *) (content + offset);
> +			u32 dword;
> +			__le32 *dwptr = (__le32 *) &dword;
> +
> +			ret = pci_read_config_dword(pdev, pos, &dword);
> +			if (ret)
> +				return ret;
> +			*dwordp = le32_to_cpu(*dwptr);

WHAT???  pci_read_config_dword() returns cpu endian data!

> +			fetched = 4;
> +		} else if (len >= 2 && !(pos % 2)) {
> +			u16 *wordp = (u16 *) (content + offset);
> +			u16 word;
> +			__le16 *wptr = (__le16 *) &word;
> +
> +			ret = pci_read_config_word(pdev, pos, &word);
> +			if (ret)
> +				return ret;
> +			*wordp = le16_to_cpu(*wptr);
> +			fetched = 2;
> +		} else {
> +			u8 *byte = (u8 *) (content + offset);
> +
> +			ret = pci_read_config_byte(pdev, pos, byte);
> +			if (ret)
> +				return ret;
> +			fetched = 1;
> +		}
> +
> +		pos += fetched;
> +		offset += fetched;
> +		len -= fetched;
> +	}
> +
> +	return ret;
> +}
> +
> +struct vfio_pci_pasid_cap_data {
> +	u32 id:16;
> +	u32 version:4;
> +	u32 next:12;
> +	union {
> +		u16 cap_reg_val;
> +		struct {
> +			u16 rsv1:1;
> +			u16 execs:1;
> +			u16 prvs:1;
> +			u16 rsv2:5;
> +			u16 pasid_bits:5;
> +			u16 rsv3:3;
> +		};
> +	} cap_reg;
> +	union {
> +		u16 control_reg_val;
> +		struct {
> +			u16 paside:1;
> +			u16 exece:1;
> +			u16 prve:1;
> +			u16 rsv4:13;
> +		};
> +	} control_reg;
> +};
> +
> +static int vfio_pci_add_pasid_cap(struct vfio_pci_device *vdev,
> +				    struct pci_dev *pdev,
> +				    u16 epos, u16 *next, __le32 **prevp)
> +{
> +	u8 *map = vdev->pci_config_map;
> +	int ecap = PCI_EXT_CAP_ID_PASID;
> +	int len = pci_ext_cap_length[ecap];
> +	struct vfio_pci_pasid_cap_data pasid_cap;
> +	struct vfio_pci_pasid_cap_data vpasid_cap;
> +	int ret;
> +
> +	/*
> +	 * If no cap filled in this function, should make sure the next
> +	 * pointer points to current epos.
> +	 */
> +	*next = epos;
> +
> +	if (!len) {
> +		pr_info("%s: VF %s hiding PASID capability\n",
> +				__func__, dev_name(&vdev->pdev->dev));
> +		ret = 0;
> +		goto out;
> +	}

No!  Why?  This is dead code.

> +
> +	/* Add PASID capability*/
> +	ret = vfio_pci_get_ecap_content(pdev, ecap,
> +					len, (u8 *)&pasid_cap);


Why wouldn't we just use vfio_fill_config_bytes() rather than this
ridiculous approach of coping it to a buffer, modifying it and copying
it out?

> +	if (ret)
> +		goto out;
> +
> +	if (!pasid_cap.control_reg.paside) {
> +		pr_debug("%s: its PF's PASID capability is not enabled\n",
> +			dev_name(&vdev->pdev->dev));
> +		ret = 0;
> +		goto out;
> +	}

What happens if the PF's PASID gets disabled while we're using it??  

> +
> +	memcpy(&vpasid_cap, &pasid_cap, len);
> +
> +	vpasid_cap.id = 0x18;

What is going on here?

#define PCI_EXT_CAP_ID_LTR      0x18    /* Latency Tolerance Reporting */

> +	vpasid_cap.next = 0;
> +	/* clear the control reg for guest */
> +	memset(&vpasid_cap.control_reg, 0x0,
> +			sizeof(vpasid_cap.control_reg));

So we zero a couple registers and that's why we need a structure to
define the entire capability and this crazy copy to a cpu endian
buffer.  No.

> +
> +	memset(map + epos, vpasid_cap.id, len);

See below.

> +	ret = vfio_fill_custom_vconfig_bytes(vdev, epos,
> +					(u8 *)&vpasid_cap, len);
> +	if (!ret) {
> +		/*
> +		 * Successfully filled in PASID cap, update
> +		 * the next offset in previous cap header,
> +		 * and also update caller about the offset
> +		 * of next cap if any.
> +		 */
> +		u32 val = epos;
> +		**prevp &= cpu_to_le32(~(0xffcU << 20));
> +		**prevp |= cpu_to_le32(val << 20);
> +		*prevp = (__le32 *)&vdev->vconfig[epos];
> +		*next = epos + len;

Could we make this any more complicated?

> +	}
> +
> +out:
> +	return ret;
> +}
> +
> +struct vfio_pci_pri_cap_data {
> +	u32 id:16;
> +	u32 version:4;
> +	u32 next:12;
> +	union {
> +		u16 control_reg_val;
> +		struct {
> +			u16 enable:1;
> +			u16 reset:1;
> +			u16 rsv1:14;
> +		};
> +	} control_reg;
> +	union {
> +		u16 status_reg_val;
> +		struct {
> +			u16 rf:1;
> +			u16 uprgi:1;
> +			u16 rsv2:6;
> +			u16 stop:1;
> +			u16 rsv3:6;
> +			u16 pasid_required:1;
> +		};
> +	} status_reg;
> +	u32 prq_capacity;
> +	u32 prq_quota;
> +};
> +
> +static int vfio_pci_add_pri_cap(struct vfio_pci_device *vdev,
> +				    struct pci_dev *pdev,
> +				    u16 epos, u16 *next, __le32 **prevp)
> +{
> +	u8 *map = vdev->pci_config_map;
> +	int ecap = PCI_EXT_CAP_ID_PRI;
> +	int len = pci_ext_cap_length[ecap];
> +	struct vfio_pci_pri_cap_data pri_cap;
> +	struct vfio_pci_pri_cap_data vpri_cap;
> +	int ret;
> +
> +	/*
> +	 * If no cap filled in this function, should make sure the next
> +	 * pointer points to current epos.
> +	 */
> +	*next = epos;
> +
> +	if (!len) {
> +		pr_info("%s: VF %s hiding PRI capability\n",
> +				__func__, dev_name(&vdev->pdev->dev));
> +		ret = 0;
> +		goto out;
> +	}
> +
> +	/* Add PASID capability*/
> +	ret = vfio_pci_get_ecap_content(pdev, ecap,
> +					len, (u8 *)&pri_cap);
> +	if (ret)
> +		goto out;
> +
> +	if (!pri_cap.control_reg.enable) {
> +		pr_debug("%s: its PF's PRI capability is not enabled\n",
> +			dev_name(&vdev->pdev->dev));
> +		ret = 0;
> +		goto out;
> +	}
> +
> +	memcpy(&vpri_cap, &pri_cap, len);
> +
> +	vpri_cap.id = 0x19;

#define PCI_EXT_CAP_ID_SECPCI   0x19    /* Secondary PCIe Capability */

???

> +	vpri_cap.next = 0;
> +	/* clear the control reg for guest */
> +	memset(&vpri_cap.control_reg, 0x0,
> +			sizeof(vpri_cap.control_reg));
> +
> +	memset(map + epos, vpri_cap.id, len);
> +	ret = vfio_fill_custom_vconfig_bytes(vdev, epos,
> +					(u8 *)&vpri_cap, len);
> +	if (!ret) {
> +		/*
> +		 * Successfully filled in PASID cap, update
> +		 * the next offset in previous cap header,
> +		 * and also update caller about the offset
> +		 * of next cap if any.
> +		 */
> +		u32 val = epos;
> +		**prevp &= cpu_to_le32(~(0xffcU << 20));
> +		**prevp |= cpu_to_le32(val << 20);
> +		*prevp = (__le32 *)&vdev->vconfig[epos];
> +		*next = epos + len;
> +	}
> +
> +out:
> +	return ret;
> +}
> +
> +static int vfio_pci_add_emulated_cap_for_vf(struct vfio_pci_device *vdev,
> +			struct pci_dev *pdev, u16 start_epos, __le32 *prev)
> +{
> +	__le32 *__prev = prev;
> +	u16 epos = start_epos, epos_next = start_epos;
> +	int ret = 0;
> +
> +	/* Add PASID capability*/
> +	ret = vfio_pci_add_pasid_cap(vdev, pdev, epos,
> +					&epos_next, &__prev);
> +	if (ret)
> +		return ret;
> +
> +	/* Add PRI capability */
> +	epos = epos_next;
> +	ret = vfio_pci_add_pri_cap(vdev, pdev, epos,
> +				   &epos_next, &__prev);
> +
> +	return ret;
> +}
> +
>  static int vfio_ecap_init(struct vfio_pci_device *vdev)
>  {
>  	struct pci_dev *pdev = vdev->pdev;
>  	u8 *map = vdev->pci_config_map;
> -	u16 epos;
> +	u16 epos, epos_max;
>  	__le32 *prev = NULL;
>  	int loops, ret, ecaps = 0;
>  
> @@ -1521,6 +1814,7 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev)
>  		return 0;
>  
>  	epos = PCI_CFG_SPACE_SIZE;
> +	epos_max = PCI_CFG_SPACE_SIZE;
>  
>  	loops = (pdev->cfg_size - PCI_CFG_SPACE_SIZE) / PCI_CAP_SIZEOF;
>  
> @@ -1545,6 +1839,9 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev)
>  			}
>  		}
>  
> +		if (epos_max <= (epos + len))
> +			epos_max = epos + len;
> +
>  		if (!len) {
>  			pci_info(pdev, "%s: hiding ecap %#x@%#x\n",
>  				 __func__, ecap, epos);
> @@ -1604,6 +1901,18 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev)
>  	if (!ecaps)
>  		*(u32 *)&vdev->vconfig[PCI_CFG_SPACE_SIZE] = 0;
>  
> +#ifdef CONFIG_PCI_ATS
> +	if (pdev->is_virtfn) {
> +		struct pci_dev *physfn = pdev->physfn;
> +
> +		ret = vfio_pci_add_emulated_cap_for_vf(vdev,
> +					physfn, epos_max, prev);
> +		if (ret)
> +			pr_info("%s, failed to add special caps for VF %s\n",
> +				__func__, dev_name(&vdev->pdev->dev));
> +	}
> +#endif

I can only imagine that we should place the caps at the same location
they exist on the PF, we don't know what hidden registers might be
hiding in config space.

> +
>  	return 0;
>  }
>  
> @@ -1748,6 +2057,17 @@ static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_device *vdev,
>  	return i;
>  }
>  
> +static bool vfio_pci_need_virt_perm(struct pci_dev *pdev, u8 cap_id)
> +{
> +#ifdef CONFIG_PCI_ATS
> +	return (pdev->is_virtfn &&
> +		(cap_id == PCI_EXT_CAP_ID_PASID ||
> +		 cap_id == PCI_EXT_CAP_ID_PRI));
> +#else
> +	return false;
> +#endif
> +}
> +
>  static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf,
>  				 size_t count, loff_t *ppos, bool iswrite)
>  {
> @@ -1781,7 +2101,8 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf,
>  	if (cap_id == PCI_CAP_ID_INVALID) {
>  		perm = &unassigned_perms;
>  		cap_start = *ppos;
> -	} else if (cap_id == PCI_CAP_ID_INVALID_VIRT) {
> +	} else if (cap_id == PCI_CAP_ID_INVALID_VIRT ||
> +		   vfio_pci_need_virt_perm(pdev, cap_id)) {

Why not simply fill the map with PCI_CAP_ID_INVALID_VIRT?

>  		perm = &virt_perms;
>  		cap_start = *ppos;
>  	} else {

This is really not close to acceptable as is.  Sorry.  Thanks,

Alex

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ