[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <50B6DBA6.7020102@ozlabs.ru>
Date: Thu, 29 Nov 2012 14:51:02 +1100
From: Alexey Kardashevskiy <aik@...abs.ru>
To: Alex Williamson <alex.williamson@...hat.com>
CC: Benjamin Herrenschmidt <benh@...nel.crashing.org>,
Paul Mackerras <paulus@...ba.org>,
linuxppc-dev@...ts.ozlabs.org, linux-kernel@...r.kernel.org,
kvm@...r.kernel.org, David Gibson <david@...son.dropbear.id.au>
Subject: Re: [PATCH] vfio powerpc: implemented IOMMU driver for VFIO
On 29/11/12 08:01, Alex Williamson wrote:
> On Wed, 2012-11-28 at 18:21 +1100, Alexey Kardashevskiy wrote:
>> VFIO implements platform independent stuff such as
>> a PCI driver, BAR access (via read/write on a file descriptor
>> or direct mapping when possible) and IRQ signaling.
>>
>> The platform dependent part includes IOMMU initialization
>> and handling. This patch implements an IOMMU driver for VFIO
>> which does mapping/unmapping pages for the guest IO and
>> provides information about DMA window (required by a POWERPC
>> guest).
>>
>> The counterpart in QEMU is required to support this functionality.
>>
>> Cc: David Gibson <david@...son.dropbear.id.au>
>> Signed-off-by: Alexey Kardashevskiy <aik@...abs.ru>
>> ---
>> drivers/vfio/Kconfig | 6 +
>> drivers/vfio/Makefile | 1 +
>> drivers/vfio/vfio_iommu_spapr_tce.c | 332 +++++++++++++++++++++++++++++++++++
>> include/linux/vfio.h | 33 ++++
>> 4 files changed, 372 insertions(+)
>> create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
>>
>> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
>> index 7cd5dec..b464687 100644
>> --- a/drivers/vfio/Kconfig
>> +++ b/drivers/vfio/Kconfig
>> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
>> depends on VFIO
>> default n
>>
>> +config VFIO_IOMMU_SPAPR_TCE
>> + tristate
>> + depends on VFIO && SPAPR_TCE_IOMMU
>> + default n
>> +
>> menuconfig VFIO
>> tristate "VFIO Non-Privileged userspace driver framework"
>> depends on IOMMU_API
>> select VFIO_IOMMU_TYPE1 if X86
>> + select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
>> help
>> VFIO provides a framework for secure userspace device drivers.
>> See Documentation/vfio.txt for more details.
>> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
>> index 2398d4a..72bfabc 100644
>> --- a/drivers/vfio/Makefile
>> +++ b/drivers/vfio/Makefile
>> @@ -1,3 +1,4 @@
>> obj-$(CONFIG_VFIO) += vfio.o
>> obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
>> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>> obj-$(CONFIG_VFIO_PCI) += pci/
>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>> new file mode 100644
>> index 0000000..b98770e
>> --- /dev/null
>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>> @@ -0,0 +1,332 @@
>> +/*
>> + * VFIO: IOMMU DMA mapping support for TCE on POWER
>> + *
>> + * Copyright (C) 2012 IBM Corp. All rights reserved.
>> + * Author: Alexey Kardashevskiy <aik@...abs.ru>
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License version 2 as
>> + * published by the Free Software Foundation.
>> + *
>> + * Derived from original vfio_iommu_type1.c:
>> + * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
>> + * Author: Alex Williamson <alex.williamson@...hat.com>
>> + */
>> +
>> +#include <linux/module.h>
>> +#include <linux/pci.h>
>> +#include <linux/slab.h>
>> +#include <linux/uaccess.h>
>> +#include <linux/err.h>
>> +#include <linux/vfio.h>
>> +#include <asm/iommu.h>
>> +
>> +#define DRIVER_VERSION "0.1"
>> +#define DRIVER_AUTHOR "aik@...abs.ru"
>> +#define DRIVER_DESC "VFIO IOMMU SPAPR TCE"
>> +
>> +static void tce_iommu_detach_group(void *iommu_data,
>> + struct iommu_group *iommu_group);
>> +
>> +/*
>> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
>> + */
>> +
>> +/*
>> + * This code handles mapping and unmapping of user data buffers
>> + * into DMA'ble space using the IOMMU
>> + */
>> +
>> +#define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT)
>> +
>> +struct vwork {
>> + struct mm_struct *mm;
>> + long npage;
>> + struct work_struct work;
>> +};
>> +
>> +/* delayed decrement/increment for locked_vm */
>> +static void lock_acct_bg(struct work_struct *work)
>> +{
>> + struct vwork *vwork = container_of(work, struct vwork, work);
>> + struct mm_struct *mm;
>> +
>> + mm = vwork->mm;
>> + down_write(&mm->mmap_sem);
>> + mm->locked_vm += vwork->npage;
>> + up_write(&mm->mmap_sem);
>> + mmput(mm);
>> + kfree(vwork);
>> +}
>> +
>> +static void lock_acct(long npage)
>> +{
>> + struct vwork *vwork;
>> + struct mm_struct *mm;
>> +
>> + if (!current->mm)
>> + return; /* process exited */
>> +
>> + if (down_write_trylock(¤t->mm->mmap_sem)) {
>> + current->mm->locked_vm += npage;
>> + up_write(¤t->mm->mmap_sem);
>> + return;
>> + }
>> +
>> + /*
>> + * Couldn't get mmap_sem lock, so must setup to update
>> + * mm->locked_vm later. If locked_vm were atomic, we
>> + * wouldn't need this silliness
>> + */
>> + vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
>> + if (!vwork)
>> + return;
>> + mm = get_task_mm(current);
>> + if (!mm) {
>> + kfree(vwork);
>> + return;
>> + }
>> + INIT_WORK(&vwork->work, lock_acct_bg);
>> + vwork->mm = mm;
>> + vwork->npage = npage;
>> + schedule_work(&vwork->work);
>> +}
>
> This looks familiar, should we split it out to a common file instead of
> duplicating it?
It is simple cut-n-paste from type1 driver :)
Moving it to a separate file is up to you but it is quite small piece of
code to move it somewhere, and I have not fixed rlimit handling yet, so
wait a bit.
>> +
>> +/*
>> + * The container descriptor supports only a single group per container.
>> + * Required by the API as the container is not supplied with the IOMMU group
>> + * at the moment of initialization.
>> + */
>> +struct tce_container {
>> + struct mutex lock;
>> + struct iommu_table *tbl;
>> +};
>> +
>> +static void *tce_iommu_open(unsigned long arg)
>> +{
>> + struct tce_container *container;
>> +
>> + if (arg != VFIO_SPAPR_TCE_IOMMU) {
>> + pr_err("tce_vfio: Wrong IOMMU type\n");
>> + return ERR_PTR(-EINVAL);
>> + }
>> +
>> + container = kzalloc(sizeof(*container), GFP_KERNEL);
>> + if (!container)
>> + return ERR_PTR(-ENOMEM);
>> +
>> + mutex_init(&container->lock);
>> +
>> + return container;
>> +}
>> +
>> +static void tce_iommu_release(void *iommu_data)
>> +{
>> + struct tce_container *container = iommu_data;
>> +
>> + WARN_ON(container->tbl && !container->tbl->it_group);
>> + if (container->tbl && container->tbl->it_group)
>> + tce_iommu_detach_group(iommu_data, container->tbl->it_group);
>> +
>> + mutex_destroy(&container->lock);
>> +
>> + kfree(container);
>> +}
>> +
>> +static long tce_iommu_ioctl(void *iommu_data,
>> + unsigned int cmd, unsigned long arg)
>> +{
>> + struct tce_container *container = iommu_data;
>> + unsigned long minsz;
>> + long ret;
>> +
>> + switch (cmd) {
>> + case VFIO_CHECK_EXTENSION: {
>> + return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
>> + }
>> + case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
>> + struct vfio_iommu_spapr_tce_info info;
>> + struct iommu_table *tbl = container->tbl;
>> +
>> + if (WARN_ON(!tbl))
>> + return -ENXIO;
>> +
>> + minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
>> + dma64_window_size);
>> +
>> + if (copy_from_user(&info, (void __user *)arg, minsz))
>> + return -EFAULT;
>> +
>> + if (info.argsz < minsz)
>> + return -EINVAL;
>> +
>> + info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
>> + info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
>> + info.dma64_window_start = 0;
>> + info.dma64_window_size = 0;
>> + info.flags = 0;
>> +
>> + if (copy_to_user((void __user *)arg, &info, minsz))
>> + return -EFAULT;
>> +
>> + return 0;
>> + }
>> + case VFIO_IOMMU_MAP_DMA: {
>> + vfio_iommu_spapr_tce_dma_map param;
>> + struct iommu_table *tbl = container->tbl;
>> + enum dma_data_direction direction;
>> + unsigned long locked, lock_limit;
>> +
>> + if (WARN_ON(!tbl))
>> + return -ENXIO;
>> +
>> + minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
>> +
>> + if (copy_from_user(¶m, (void __user *)arg, minsz))
>> + return -EFAULT;
>> +
>> + if (param.argsz < minsz)
>> + return -EINVAL;
>> +
>> + if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
>> + (param.flags & VFIO_DMA_MAP_FLAG_WRITE))
>> + direction = DMA_BIDIRECTIONAL;
>> + else if (param.flags & VFIO_DMA_MAP_FLAG_READ)
>> + direction = DMA_TO_DEVICE;
>> + else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
>> + direction = DMA_FROM_DEVICE;
>> + else
>> + return -EINVAL;
>> +
>> + if ((param.size & ~IOMMU_PAGE_MASK) ||
>> + (param.iova & ~IOMMU_PAGE_MASK) ||
>> + (param.vaddr & ~IOMMU_PAGE_MASK))
>> + return -EINVAL;
>> +
>> + /* Account for locked pages */
>> + locked = current->mm->locked_vm +
>> + (param.size >> IOMMU_PAGE_SHIFT);
>> + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>
> This page accounting doesn't look right. PAGE_SIZE is several orders
> bigger than IOMMU_PAGE_SIZE (right?), but we mix them here, which seems
> like it will over penalize the user. For example, if a user maps 4x4k
> (assume aligned and contiguous) IOMMU pages, isn't that only a single
> pinned system page (assuming >=16k pages).
Oops. My bad. IOMMU_PAGE_SHIFT should be PAGE_SHIFT and should return the
number of system pages.
But we do not track 4K pages so I do not see any easy solution here. Except
fixing iommu_put_tces/iommu_clear_tces (*) to return the number of the very
first 4K IOMMU pages within system 64K pages.
This won't be too accurate but should work, no?
I'll post it as a patch in reply to "vfio powerpc: enabled on powernv
platform".
>> + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
>> + pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
>> + rlimit(RLIMIT_MEMLOCK));
>> + return -ENOMEM;
>> + }
>> +
>> + ret = iommu_put_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
>> + param.vaddr, direction,
>> + param.size >> IOMMU_PAGE_SHIFT);
>> + if (ret > 0)
>> + lock_acct(ret);
>> +
>> + return ret;
>> + }
>> + case VFIO_IOMMU_UNMAP_DMA: {
>> + vfio_iommu_spapr_tce_dma_unmap param;
>> + struct iommu_table *tbl = container->tbl;
>> +
>> + if (WARN_ON(!tbl))
>> + return -ENXIO;
>> +
>> + minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
>> +
>> + if (copy_from_user(¶m, (void __user *)arg, minsz))
>> + return -EFAULT;
>> +
>> + if (param.argsz < minsz)
>> + return -EINVAL;
>> +
>> + if ((param.size & ~IOMMU_PAGE_MASK) ||
>> + (param.iova & ~IOMMU_PAGE_MASK))
>> + return -EINVAL;
>> +
>> + ret = iommu_clear_tces(tbl, param.iova >> IOMMU_PAGE_SHIFT,
>> + param.size >> IOMMU_PAGE_SHIFT);
>> + if (ret > 0)
>> + lock_acct(-ret);
>> +
>> + return ret;
>> + }
>> + default:
>> + pr_warn("tce_vfio: unexpected cmd %x\n", cmd);
>> + }
>> +
>> + return -ENOTTY;
>> +}
>> +
>> +static int tce_iommu_attach_group(void *iommu_data,
>> + struct iommu_group *iommu_group)
>> +{
>> + struct tce_container *container = iommu_data;
>> + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
>> +
>> + BUG_ON(!tbl);
>> + mutex_lock(&container->lock);
>> + pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
>> + iommu_group_id(iommu_group), iommu_group);
>> + if (container->tbl) {
>> + pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
>> + iommu_group_id(container->tbl->it_group),
>> + iommu_group_id(iommu_group));
>> + mutex_unlock(&container->lock);
>> + return -EBUSY;
>> + }
>> +
>> + container->tbl = tbl;
>> + iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
>> + mutex_unlock(&container->lock);
>> +
>> + return 0;
>> +}
>> +
>> +static void tce_iommu_detach_group(void *iommu_data,
>> + struct iommu_group *iommu_group)
>> +{
>> + struct tce_container *container = iommu_data;
>> + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
>> +
>> + BUG_ON(!tbl);
>> + mutex_lock(&container->lock);
>> + if (tbl != container->tbl) {
>> + pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
>> + iommu_group_id(iommu_group),
>> + iommu_group_id(tbl->it_group));
>> + } else {
>> +
>> + pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
>> + iommu_group_id(iommu_group), iommu_group);
>> +
>> + iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
>> + container->tbl = NULL;
>> + }
>> + mutex_unlock(&container->lock);
>> +}
>> +
>> +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
>> + .name = "iommu-vfio-powerpc",
>> + .owner = THIS_MODULE,
>> + .open = tce_iommu_open,
>> + .release = tce_iommu_release,
>> + .ioctl = tce_iommu_ioctl,
>> + .attach_group = tce_iommu_attach_group,
>> + .detach_group = tce_iommu_detach_group,
>> +};
>> +
>> +static int __init tce_iommu_init(void)
>> +{
>> + return vfio_register_iommu_driver(&tce_iommu_driver_ops);
>> +}
>> +
>> +static void __exit tce_iommu_cleanup(void)
>> +{
>> + vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
>> +}
>> +
>> +module_init(tce_iommu_init);
>> +module_exit(tce_iommu_cleanup);
>> +
>> +MODULE_VERSION(DRIVER_VERSION);
>> +MODULE_LICENSE("GPL v2");
>> +MODULE_AUTHOR(DRIVER_AUTHOR);
>> +MODULE_DESCRIPTION(DRIVER_DESC);
>> +
>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
>> index 0a4f180..820af1e 100644
>> --- a/include/linux/vfio.h
>> +++ b/include/linux/vfio.h
>> @@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
>> /* Extensions */
>>
>> #define VFIO_TYPE1_IOMMU 1
>> +#define VFIO_SPAPR_TCE_IOMMU 2
>>
>> /*
>> * The IOCTL interface is designed for extensibility by embedding the
>> @@ -442,4 +443,36 @@ struct vfio_iommu_type1_dma_unmap {
>>
>> #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>>
>> +/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
>> +
>> +/*
>> + * The SPAPR TCE info struct provides the information about the PCI bus
>> + * address ranges available for DMA, these values are programmed into
>> + * the hardware so the guest has to know that information.
>> + *
>> + * Pages within 32 bit window should be explicitely mapped/unmapped via ioctls.
> ^^^^^^^^^^^
> explicitly
>
>> + * 64 bit window (not supported at the moment for the guest) is supposed to
>> + * be mapped completely to the guest memory so the devices capable of 64bit
>> + * DMA will not have to use map/unmap ioctls.
>> + *
>> + * The IOMMU page size is always 4K.
>> + */
>
> Thanks,
>
> Alex
>
>> +
>> +struct vfio_iommu_spapr_tce_info {
>> + __u32 argsz;
>> + __u32 flags; /* reserved for future use */
>> + __u32 dma32_window_start; /* 32 bit window start (bytes) */
>> + __u32 dma32_window_size; /* 32 bit window size (bytes) */
>> + __u64 dma64_window_start; /* 64 bit window start (bytes) */
>> + __u64 dma64_window_size; /* 64 bit window size (bytes) */
>> +};
>> +
>> +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
>> +
>> +/* Reuse type1 map/unmap structs as they are the same at the moment */
>> +typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
>> +typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
>> +
>> +/* ***************************************************************** */
>> +
>> #endif /* VFIO_H */
>
>
>
--
Alexey
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists