[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <50BDB074.1000403@ozlabs.ru>
Date: Tue, 04 Dec 2012 19:12:36 +1100
From: Alexey Kardashevskiy <aik@...abs.ru>
To: Alex Williamson <alex.williamson@...hat.com>
CC: Benjamin Herrenschmidt <benh@...nel.crashing.org>,
Paul Mackerras <paulus@...ba.org>,
David Gibson <david@...son.dropbear.id.au>,
linuxppc-dev@...ts.ozlabs.org, linux-kernel@...r.kernel.org,
kvm@...r.kernel.org
Subject: Re: [PATCH 1/2] vfio powerpc: enabled on powernv platform
On 04/12/12 04:35, Alex Williamson wrote:
> On Mon, 2012-12-03 at 13:52 +1100, Alexey Kardashevskiy wrote:
>> This patch initializes IOMMU groups based on the IOMMU
>> configuration discovered during the PCI scan on POWERNV
>> (POWER non virtualized) platform. The IOMMU groups are
>> to be used later by VFIO driver (PCI pass through).
>>
>> It also implements an API for mapping/unmapping pages for
>> guest PCI drivers and providing DMA window properties.
>> This API is going to be used later by QEMU-VFIO to handle
>> h_put_tce hypercalls from the KVM guest.
>>
>> Although this driver has been tested only on the POWERNV
>> platform, it should work on any platform which supports
>> TCE tables.
>>
>> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
>> option and configure VFIO as required.
>>
>> Cc: David Gibson <david@...son.dropbear.id.au>
>> Signed-off-by: Alexey Kardashevskiy <aik@...abs.ru>
>> ---
>> arch/powerpc/include/asm/iommu.h | 9 ++
>> arch/powerpc/kernel/iommu.c | 186 ++++++++++++++++++++++++++++++++++
>> arch/powerpc/platforms/powernv/pci.c | 135 ++++++++++++++++++++++++
>> drivers/iommu/Kconfig | 8 ++
>> 4 files changed, 338 insertions(+)
>>
>> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>> index cbfe678..5c7087a 100644
>> --- a/arch/powerpc/include/asm/iommu.h
>> +++ b/arch/powerpc/include/asm/iommu.h
>> @@ -76,6 +76,9 @@ struct iommu_table {
>> struct iommu_pool large_pool;
>> struct iommu_pool pools[IOMMU_NR_POOLS];
>> unsigned long *it_map; /* A simple allocation bitmap for now */
>> +#ifdef CONFIG_IOMMU_API
>> + struct iommu_group *it_group;
>> +#endif
>> };
>>
>> struct scatterlist;
>> @@ -147,5 +150,11 @@ static inline void iommu_restore(void)
>> }
>> #endif
>>
>> +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
>> + unsigned long pages);
>> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
>> + uint64_t tce, enum dma_data_direction direction,
>> + unsigned long pages);
>> +
>> #endif /* __KERNEL__ */
>> #endif /* _ASM_IOMMU_H */
>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>> index ff5a6ce..2738aa4 100644
>> --- a/arch/powerpc/kernel/iommu.c
>> +++ b/arch/powerpc/kernel/iommu.c
>> @@ -44,6 +44,7 @@
>> #include <asm/kdump.h>
>> #include <asm/fadump.h>
>> #include <asm/vio.h>
>> +#include <asm/tce.h>
>>
>> #define DBG(...)
>>
>> @@ -856,3 +857,188 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>> free_pages((unsigned long)vaddr, get_order(size));
>> }
>> }
>> +
>> +#ifdef CONFIG_IOMMU_API
>> +/*
>> + * SPAPR TCE API
>> + */
>> +
>> +/*
>> + * Returns the number of used IOMMU pages (4K) within
>> + * the same system page (4K or 64K).
>> + * bitmap_weight is not used as it does not support bigendian maps.
>> + */
>> +static int syspage_weight(unsigned long *map, unsigned long entry)
>> +{
>> + int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
>> +
>> + /* Aligns TCE entry number to system page boundary */
>> + entry &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
>> +
>> + /* Count used 4K pages */
>> + while (nbits--)
>> + ret += (test_bit(entry++, map) == 0) ? 0 : 1;
>> +
>> + return ret;
>> +}
>> +
>> +static void tce_flush(struct iommu_table *tbl)
>> +{
>> + /* Flush/invalidate TLB caches if necessary */
>> + if (ppc_md.tce_flush)
>> + ppc_md.tce_flush(tbl);
>> +
>> + /* Make sure updates are seen by hardware */
>> + mb();
>> +}
>> +
>> +/*
>> + * iommu_clear_tces clears tces and returned the number of system pages
>> + * which it called put_page() on
>> + */
>> +static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
>> + unsigned long pages)
>> +{
>> + int i, retpages = 0;
>> + unsigned long oldtce, oldweight;
>> + struct page *page;
>> +
>> + for (i = 0; i < pages; ++i) {
>> + oldtce = ppc_md.tce_get(tbl, entry + i);
>> + ppc_md.tce_free(tbl, entry + i, 1);
>> +
>> + oldweight = syspage_weight(tbl->it_map, entry);
>> + __clear_bit(entry - tbl->it_offset, tbl->it_map);
>> +
>> + if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
>> + continue;
>
> Could this happen earlier, above syspage_weight() and __clear_bit()?
I want to clear it anyway in case it was not cleared for some reason. Added WARN_ON.
>> +
>> + page = pfn_to_page(oldtce >> PAGE_SHIFT);
>> +
>> + WARN_ON(!page);
>> + if (!page)
>> + continue;
>> +
>> + if (oldtce & TCE_PCI_WRITE)
>> + SetPageDirty(page);
>> +
>> + put_page(page);
>> +
>> + /* That was the last IOMMU page within the system page */
>> + if ((oldweight == 1) && !syspage_weight(tbl->it_map, entry))
>> + ++retpages;
>
> If you used __test_and_clear_bit() above I think you could avoid this
> 2nd call to syspage_weight. A minor optimization though.
>
>> + }
>> +
>> + return retpages;
>> +}
>> +
>> +/*
>> + * iommu_clear_tces clears tces and returned the number
>> + / of released system pages
>> + */
>
> Something bad happened to your comments here.
>
>> +long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
>> + unsigned long pages)
>> +{
>> + int ret;
>> + struct iommu_pool *pool = get_pool(tbl, entry);
>> +
>> + spin_lock(&(pool->lock));
>> + ret = clear_tces_nolock(tbl, entry, pages);
>> + tce_flush(tbl);
>> + spin_unlock(&(pool->lock));
>> +
>> + return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(iommu_clear_tces);
>> +
>> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
>> + uint64_t tce, enum dma_data_direction direction)
>> +{
>> + int ret;
>> + struct page *page = NULL;
>> + unsigned long kva, offset, oldweight;
>> +
>> + /* Map new TCE */
>> + offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
>
> Maybe the compiler will figure this out, but isn't this the same as tce
> & (IOMMU_PAGE_MASK & PAGE_MASK)?
It is rather (tce & (IOMMU_PAGE_MASK & ~PAGE_MASK)), but I cannot see how that
is simpler, and I doubt it is faster by enough to be noticeable anyhow :)
>> + ret = get_user_pages_fast(tce & PAGE_MASK, 1,
>> + direction != DMA_TO_DEVICE, &page);
>> + if (ret < 1) {
>
> Probably (ret != 1) here or else we never get to your >1 case below.
>
>> + printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
>> + tce, entry << IOMMU_PAGE_SHIFT, ret);
>
> Use pr_err
>
>> + if (!ret || (ret > 1))
>
> Then (ret >= 0) here. Or return (ret >= 0) ? -EFAULT : ret
>
>> + ret = -EFAULT;
>> + return ret;
>> + }
>
> You're missing the code from x86 that handles mapping mmap'd ranges.
> This is intended to allow peer-to-peer DMA between devices. Is that
> intentional?
I am not following you here. What code exactly are you talking about? We do not
track ranges at all, and I do not see how it helps with p2p DMA.
>> +
>> + kva = (unsigned long) page_address(page);
>> + kva += offset;
>> +
>> + /* tce_build receives a virtual address */
>> + entry += tbl->it_offset; /* Offset into real TCE table */
>
> Here's what makes me call the entry "relative" rather than zero-based.
This is actually a bug; I overlooked it and have removed it now. Thanks for
being so picky :)
> The iova is relative to the start of dma32_window_start, ie. if the
> window starts at bus address 512MB and I want to create a translation at
> bus address 512MB, I pass in an iova of 0, right? The above adds the
> window offset. So you've removed dma64 window, but we really need to
> define iova better.
>> + ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
>> +
>> + /* tce_build() only returns non-zero for transient errors */
>> + if (unlikely(ret)) {
>> + printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
>> + tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
>
> Use pr_err
>
>> + put_page(page);
>> + return -EIO;
>> + }
>> +
>> + /* Calculate if new system page has been locked */
>> + oldweight = syspage_weight(tbl->it_map, entry);
>> + __set_bit(entry - tbl->it_offset, tbl->it_map);
>> +
>> + return (oldweight == 0) ? 1 : 0;
>> +}
>> +
>> +/*
>> + * iommu_put_tces builds tces and returned the number of actually
>> + * locked system pages
>> + */
>> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
>> + uint64_t tce, enum dma_data_direction direction,
>> + unsigned long pages)
>> +{
>> + int i, ret = 0, retpages = 0;
>> + struct iommu_pool *pool = get_pool(tbl, entry);
>> +
>> + BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
>> + BUG_ON(direction == DMA_NONE);
>
> This doesn't seem BUG worthy, -EINVAL? We can't assume tce_iommu_ioctl
> will always be the only caller of this function.
This is what the other functions in this file do.
>> +
>> + spin_lock(&(pool->lock));
>> +
>> + /* Check if any is in use */
>> + for (i = 0; i < pages; ++i) {
>> + unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
>> + if ((oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) ||
>> + test_bit(entry + i, tbl->it_map)) {
>> + WARN_ON(test_bit(entry + i, tbl->it_map));
>
> The WARN_ON seems to confirm that these are redundant tests, does that
> imply we don't trust it_map? It would be a lot faster if we could rely
> on it_map exclusively here.
To me this is a pretty minor optimization. I'm testing it now to make sure I am
not missing any bits.
>> + spin_unlock(&(pool->lock));
>> + return -EBUSY;
>> + }
>> + }
>> +
>> + /* Put tces to the table */
>> + for (i = 0; (i < pages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
>> + ret = put_tce(tbl, entry + i, tce, direction);
>> + if (ret == 1)
>> + ++retpages;
>> + }
>> +
>> + /*
>> + * If failed, release locked pages, otherwise return the number
>> + * of locked system pages
>> + */
>> + if (ret < 0)
>> + clear_tces_nolock(tbl, entry, i);
>> + else
>> + ret = retpages;
>> +
>> + tce_flush(tbl);
>> + spin_unlock(&(pool->lock));
>> +
>> + return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(iommu_put_tces);
>> +#endif /* CONFIG_IOMMU_API */
>> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
>> index 05205cf..21250ef 100644
>> --- a/arch/powerpc/platforms/powernv/pci.c
>> +++ b/arch/powerpc/platforms/powernv/pci.c
>> @@ -20,6 +20,7 @@
>> #include <linux/irq.h>
>> #include <linux/io.h>
>> #include <linux/msi.h>
>> +#include <linux/iommu.h>
>>
>> #include <asm/sections.h>
>> #include <asm/io.h>
>> @@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
>> ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
>> #endif
>> }
>> +
>> +#ifdef CONFIG_IOMMU_API
>> +/*
>> + * IOMMU groups support required by VFIO
>> + */
>> +static int add_device(struct device *dev)
>> +{
>> + struct iommu_table *tbl;
>> + int ret = 0;
>> +
>> + if (WARN_ON(dev->iommu_group)) {
>> + printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
>> + dev_name(dev),
>> + iommu_group_id(dev->iommu_group));
>
> Use pr_warn
>
>> + return -EBUSY;
>> + }
>> +
>> + tbl = get_iommu_table_base(dev);
>> + if (!tbl) {
>> + pr_debug("tce_vfio: skipping device %s with no tbl\n",
>> + dev_name(dev));
>> + return 0;
>> + }
>> +
>> + pr_debug("tce_vfio: adding %s to iommu group %d\n",
>> + dev_name(dev), iommu_group_id(tbl->it_group));
>> +
>> + ret = iommu_group_add_device(tbl->it_group, dev);
>> + if (ret < 0)
>> + printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
>> + dev_name(dev), ret);
>
> Use pr_err
>
>> +
>> + return ret;
>> +}
>> +
>> +static void del_device(struct device *dev)
>> +{
>> + iommu_group_remove_device(dev);
>> +}
>> +
>> +static int iommu_bus_notifier(struct notifier_block *nb,
>> + unsigned long action, void *data)
>> +{
>> + struct device *dev = data;
>> +
>> + switch (action) {
>> + case BUS_NOTIFY_ADD_DEVICE:
>> + return add_device(dev);
>> + case BUS_NOTIFY_DEL_DEVICE:
>> + del_device(dev);
>> + return 0;
>> + default:
>> + return 0;
>> + }
>> +}
>> +
>> +static struct notifier_block tce_iommu_bus_nb = {
>> + .notifier_call = iommu_bus_notifier,
>> +};
>> +
>> +static void group_release(void *iommu_data)
>> +{
>> + struct iommu_table *tbl = iommu_data;
>> + tbl->it_group = NULL;
>> +}
>> +
>> +static int __init tce_iommu_init(void)
>> +{
>> + struct pci_dev *pdev = NULL;
>> + struct iommu_table *tbl;
>> + struct iommu_group *grp;
>> +
>> + /* Allocate and initialize IOMMU groups */
>> + for_each_pci_dev(pdev) {
>> + tbl = get_iommu_table_base(&pdev->dev);
>> + if (!tbl)
>> + continue;
>> +
>> + /* Skip already initialized */
>> + if (tbl->it_group)
>> + continue;
>> +
>> + grp = iommu_group_alloc();
>> + if (IS_ERR(grp)) {
>> + printk(KERN_INFO "tce_vfio: cannot create "
>> + "new IOMMU group, ret=%ld\n",
>> + PTR_ERR(grp));
>
> Use pr_info
>
>> + return PTR_ERR(grp);
>> + }
>> + tbl->it_group = grp;
>> + iommu_group_set_iommudata(grp, tbl, group_release);
>> + }
>> +
>> + bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
>> +
>> + /* Add PCI devices to VFIO groups */
>> + for_each_pci_dev(pdev)
>> + add_device(&pdev->dev);
>> +
>> + return 0;
>> +}
>> +
>> +static void __exit tce_iommu_cleanup(void)
>> +{
>> + struct pci_dev *pdev = NULL;
>> + struct iommu_table *tbl;
>> + struct iommu_group *grp = NULL;
>> +
>> + bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
>> +
>> + /* Delete PCI devices from VFIO groups */
>> + for_each_pci_dev(pdev)
>> + del_device(&pdev->dev);
>> +
>> + /* Release VFIO groups */
>> + for_each_pci_dev(pdev) {
>> + tbl = get_iommu_table_base(&pdev->dev);
>> + if (!tbl)
>> + continue;
>> + grp = tbl->it_group;
>> +
>> + /* Skip (already) uninitialized */
>> + if (!grp)
>> + continue;
>> +
>> + /* Do actual release, group_release() is expected to work */
>> + iommu_group_put(grp);
>> + BUG_ON(tbl->it_group);
>> + }
>> +}
>> +
>> +module_init(tce_iommu_init);
>> +module_exit(tce_iommu_cleanup);
>> +#endif /* CONFIG_IOMMU_API */
>> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
>> index 9f69b56..29d11dc 100644
>> --- a/drivers/iommu/Kconfig
>> +++ b/drivers/iommu/Kconfig
>> @@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
>>
>> Say N unless you need kernel log message for IOMMU debugging
>>
>> +config SPAPR_TCE_IOMMU
>> + bool "sPAPR TCE IOMMU Support"
>> + depends on PPC_POWERNV
>> + select IOMMU_API
>> + help
>> + Enables bits of IOMMU API required by VFIO. The iommu_ops is
>> + still not implemented.
>> +
>> endif # IOMMU_SUPPORT
>
> Thanks,
>
> Alex
>
--
Alexey
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists