lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Wed, 19 Jun 2024 13:02:11 -0300
From: Jason Gunthorpe <jgg@...pe.ca>
To: Zong Li <zong.li@...ive.com>
Cc: joro@...tes.org, will@...nel.org, robin.murphy@....com,
	tjeznach@...osinc.com, paul.walmsley@...ive.com, palmer@...belt.com,
	aou@...s.berkeley.edu, kevin.tian@...el.com,
	linux-kernel@...r.kernel.org, iommu@...ts.linux.dev,
	linux-riscv@...ts.infradead.org
Subject: Re: [RFC PATCH v2 07/10] iommu/riscv: support nested iommu for
 creating domains owned by userspace

On Fri, Jun 14, 2024 at 10:21:53PM +0800, Zong Li wrote:
> This patch implements .domain_alloc_user operation for creating domains
> owend by userspace, e.g. through IOMMUFD. Add s2 domain for parent
> domain for second stage, s1 domain will be the first stage.
> 
> Don't remove IOMMU private data of dev in blocked domain, because it
> holds the user data of device, which is used when attaching device into
> s1 domain.
> 
> Signed-off-by: Zong Li <zong.li@...ive.com>
> ---
>  drivers/iommu/riscv/iommu.c  | 236 ++++++++++++++++++++++++++++++++++-
>  include/uapi/linux/iommufd.h |  17 +++
>  2 files changed, 252 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> index 2130106e421f..410b236e9b24 100644
> --- a/drivers/iommu/riscv/iommu.c
> +++ b/drivers/iommu/riscv/iommu.c
> @@ -846,6 +846,8 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
>  
>  /* This struct contains protection domain specific IOMMU driver data. */
>  struct riscv_iommu_domain {
> +	struct riscv_iommu_domain *s2;
> +	struct riscv_iommu_device *iommu;

IMHO you should create a riscv_iommu_domain_nested and not put these
here, like ARM did.

The kernel can't change the nested domain so it can't recieve and
distribute invalidations.

> +/**
> + * Nested IOMMU operations
> + */
> +
> +static int riscv_iommu_attach_dev_nested(struct iommu_domain *domain, struct device *dev)
> +{
> +	struct riscv_iommu_domain *riscv_domain = iommu_domain_to_riscv(domain);
> +	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> +	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
> +
> +	/*
> +	 * Add bond to the new domain's list, but don't unlink in current domain.
> +	 * We need to flush entries in stage-2 page table by iterating the list.
> +	 */
> +	if (riscv_iommu_bond_link(riscv_domain, dev))
> +		return -ENOMEM;
> +
> +	riscv_iommu_iotlb_inval(riscv_domain, 0, ULONG_MAX);
> +	info->dc_user.ta |= RISCV_IOMMU_PC_TA_V;

Seems odd??

> +	riscv_iommu_iodir_update(iommu, dev, &info->dc_user);

This will be need some updating to allow changes that don't toggle
V=0, like in arm.

> +	info->domain = riscv_domain;
> +
> +	return 0;
> +}
> +
> +static void riscv_iommu_domain_free_nested(struct iommu_domain *domain)
> +{
> +	struct riscv_iommu_domain *riscv_domain = iommu_domain_to_riscv(domain);
> +	struct riscv_iommu_bond *bond;
> +
> +	/* Unlink bond in s2 domain, because we link bond both on s1 and s2 domain */
> +	list_for_each_entry_rcu(bond, &riscv_domain->s2->bonds, list)
> +		riscv_iommu_bond_unlink(riscv_domain->s2, bond->dev);
> +
> +	if ((int)riscv_domain->pscid > 0)
> +		ida_free(&riscv_iommu_pscids, riscv_domain->pscid);
> +
> +	kfree(riscv_domain);
> +}
> +
> +static const struct iommu_domain_ops riscv_iommu_nested_domain_ops = {
> +	.attach_dev	= riscv_iommu_attach_dev_nested,
> +	.free		= riscv_iommu_domain_free_nested,
> +};
> +
> +static int
> +riscv_iommu_get_dc_user(struct device *dev, struct iommu_hwpt_riscv_iommu *user_arg)
> +{
> +	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> +	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> +	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
> +	struct riscv_iommu_dc dc;
> +	struct riscv_iommu_fq_record event;
> +	u64 dc_len = sizeof(struct riscv_iommu_dc) >>
> +		     (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT));
> +	u64 event_len = sizeof(struct riscv_iommu_fq_record);
> +	void __user *event_user = NULL;
> +
> +	for (int i = 0; i < fwspec->num_ids; i++) {
> +		event.hdr =
> +			FIELD_PREP(RISCV_IOMMU_FQ_HDR_CAUSE, RISCV_IOMMU_FQ_CAUSE_DDT_INVALID) |
> +			FIELD_PREP(RISCV_IOMMU_FQ_HDR_DID, fwspec->ids[i]);
> +
> +		/* Sanity check DC of stage-1 from user data */
> +		if (!user_arg->out_event_uptr || user_arg->event_len != event_len)
> +			return -EINVAL;

This is not extensible, see below about just inlining it.

> +		event_user = u64_to_user_ptr(user_arg->out_event_uptr);
> +
> +		if (!user_arg->dc_uptr || user_arg->dc_len != dc_len)
> +			return -EINVAL;
> +
> +		if (copy_from_user(&dc, u64_to_user_ptr(user_arg->dc_uptr), dc_len))
> +			return -EFAULT;
> +
> +		if (!(dc.tc & RISCV_IOMMU_DDTE_V)) {
> +			dev_dbg(dev, "Invalid DDT from user data\n");
> +			if (copy_to_user(event_user, &event, event_len))
> +				return -EFAULT;
> +		}

On ARM we are going to support non-valid STEs. It should put the the
translation into blocking and ideally emulate translation failure
events.

> +
> +		if (!dc.fsc || dc.iohgatp) {
> +			dev_dbg(dev, "Wrong page table from user data\n");
> +			if (copy_to_user(event_user, &event, event_len))
> +				return -EFAULT;
> +		}
> +
> +		/* Save DC of stage-1 from user data */
> +		memcpy(&info->dc_user,
> +		       riscv_iommu_get_dc(iommu, fwspec->ids[i]),

This does not seem right, the fwspec shouldn't be part of domain
allocation, even arguably for nesting. The nesting domain should
represent the user_dc only. Any customization of kernel controlled bits
should be done during attach only, nor do I really understand why this
is looping over all the fwspecs but only memcpying the last one..

> +		       sizeof(struct riscv_iommu_dc));
> +		info->dc_user.fsc = dc.fsc;
> +	}
> +
> +	return 0;
> +}
> +
> +static struct iommu_domain *
> +riscv_iommu_domain_alloc_nested(struct device *dev,
> +				struct iommu_domain *parent,
> +				const struct iommu_user_data *user_data)
> +{
> +	struct riscv_iommu_domain *s2_domain = iommu_domain_to_riscv(parent);
> +	struct riscv_iommu_domain *s1_domain;
> +	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> +	struct iommu_hwpt_riscv_iommu arg;
> +	int ret, va_bits;
> +
> +	if (user_data->type != IOMMU_HWPT_DATA_RISCV_IOMMU)
> +		return ERR_PTR(-EOPNOTSUPP);
> +
> +	if (parent->type != IOMMU_DOMAIN_UNMANAGED)
> +		return ERR_PTR(-EINVAL);
> +
> +	ret = iommu_copy_struct_from_user(&arg,
> +					  user_data,
> +					  IOMMU_HWPT_DATA_RISCV_IOMMU,
> +					  out_event_uptr);
> +	if (ret)
> +		return ERR_PTR(ret);
> +
> +	s1_domain = kzalloc(sizeof(*s1_domain), GFP_KERNEL);
> +	if (!s1_domain)
> +		return ERR_PTR(-ENOMEM);
> +
> +	spin_lock_init(&s1_domain->lock);
> +	INIT_LIST_HEAD_RCU(&s1_domain->bonds);
> +
> +	s1_domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
> +					   RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
> +	if (s1_domain->pscid < 0) {
> +		iommu_free_page(s1_domain->pgd_root);
> +		kfree(s1_domain);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	/* Get device context of stage-1 from user*/
> +	ret = riscv_iommu_get_dc_user(dev, &arg);
> +	if (ret) {
> +		kfree(s1_domain);
> +		return ERR_PTR(-EINVAL);
> +	}
> +
> +	if (!iommu) {
> +		va_bits = VA_BITS;
> +	} else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) {
> +		va_bits = 57;
> +	} else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48) {
> +		va_bits = 48;
> +	} else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39) {
> +		va_bits = 39;
> +	} else {
> +		dev_err(dev, "cannot find supported page table mode\n");
> +		return ERR_PTR(-ENODEV);
> +	}
> +
> +	/*
> +	 * The ops->domain_alloc_user could be directly called by the iommufd core,
> +	 * instead of iommu core. So, this function need to set the default value of
> +	 * following data member:
> +	 *  - domain->pgsize_bitmap
> +	 *  - domain->geometry
> +	 *  - domain->type
> +	 *  - domain->ops
> +	 */
> +	s1_domain->s2 = s2_domain;
> +	s1_domain->iommu = iommu;
> +	s1_domain->domain.type = IOMMU_DOMAIN_NESTED;
> +	s1_domain->domain.ops = &riscv_iommu_nested_domain_ops;
> +	s1_domain->domain.pgsize_bitmap = SZ_4K;
> +	s1_domain->domain.geometry.aperture_start = 0;
> +	s1_domain->domain.geometry.aperture_end = DMA_BIT_MASK(va_bits - 1);
> +	s1_domain->domain.geometry.force_aperture = true;

There is no geometry or page size of nesting domains.

> +
> +	return &s1_domain->domain;
> +}
> +
> +static struct iommu_domain *
> +riscv_iommu_domain_alloc_user(struct device *dev, u32 flags,
> +			      struct iommu_domain *parent,
> +			      const struct iommu_user_data *user_data)
> +{
> +	struct iommu_domain *domain;
> +	struct riscv_iommu_domain *riscv_domain;
> +
> +	/* Allocate stage-1 domain if it has stage-2 parent domain */
> +	if (parent)
> +		return riscv_iommu_domain_alloc_nested(dev, parent, user_data);
> +
> +	if (flags & ~((IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
> +		return ERR_PTR(-EOPNOTSUPP);
> +
> +	if (user_data)
> +		return ERR_PTR(-EINVAL);
> +
> +	/* domain_alloc_user op needs to be fully initialized */
> +	domain = iommu_domain_alloc(dev->bus);

Please organize your driver to avoid calling this core function
through a pointer and return the correct type from the start so you
don't have to cast.

> +	if (!domain)
> +		return ERR_PTR(-ENOMEM);
> +
> +	/*
> +	 * We assume that nest-parent or g-stage only will come here
> +	 * TODO: Shadow page table doesn't be supported now.
> +	 *       We currently can't distinguish g-stage and shadow
> +	 *       page table here. Shadow page table shouldn't be
> +	 *       put at stage-2.
> +	 */
> +	riscv_domain = iommu_domain_to_riscv(domain);
> +
> +	/* pgd_root may be allocated in .domain_alloc_paging */
> +	if (riscv_domain->pgd_root)
> +		iommu_free_page(riscv_domain->pgd_root);

And don't do stuff like this, if domain_alloc didn't do the right
stuff then reorganize it so that it does. Most likely pass in a flag
that you are allocating the nest so it can setup properly if it is
only a small change like this.

> +/**
> + * struct iommu_hwpt_riscv_iommu - RISCV IOMMU stage-1 device context table
> + *                                 info (IOMMU_HWPT_TYPE_RISCV_IOMMU)
> + * @dc_len: Length of device context
> + * @dc_uptr: User pointer to the address of device context
> + * @event_len: Length of an event record
> + * @out_event_uptr: User pointer to the address of event record
> + */
> +struct iommu_hwpt_riscv_iommu {
> +	__aligned_u64 dc_len;
> +	__aligned_u64 dc_uptr;

Do we really want this to be a pointer? ARM just inlined it in the
struct, why not do that?

> +	__aligned_u64 event_len;
> +	__aligned_u64 out_event_uptr;
> +};

Similar here too, why not just inline the response memory?

Jason

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ