Message-ID: <20100730202724.GA920@andromeda.dapyr.net>
Date: Fri, 30 Jul 2010 16:27:24 -0400
From: Konrad Rzeszutek Wilk <konrad@...nok.org>
To: FUJITA Tomonori <fujita.tomonori@....ntt.co.jp>
Cc: linux-kernel@...r.kernel.org, linux-arch@...r.kernel.org,
ak@...ux.intel.com, konrad.wilk@...cle.com, akataria@...are.com
Subject: Re: [PATCH] swiotlb: enlarge iotlb buffer on demand
On Sat, Jul 31, 2010 at 12:37:54AM +0900, FUJITA Tomonori wrote:
> Note that this isn't for the next merge window. It seems to work,
> but I need more testing and cleanups (and I need to fix the ia64 code).
>
> =
> From: FUJITA Tomonori <fujita.tomonori@....ntt.co.jp>
> Subject: [PATCH] swiotlb: enlarge iotlb buffer on demand
>
> This enables swiotlb to enlarge the iotlb (bounce) buffer on demand.
Neat!
>
> On x86_64, swiotlb is enabled only when more than 4GB of memory is
> available. swiotlb uses 64MB of memory by default, and 64MB is not so
> precious in this case, I suppose.
>
> The problem is that x86_64 likely always needs to enable swiotlb due
> to hotplug memory support, and then 64MB could be very precious.
>
> The swiotlb iotlb (bounce) buffer is physically contiguous (64MB by
> default). With this patch, the iotlb buffer no longer needs to be
> physically contiguous, so swiotlb can allocate it on demand.
> Currently, swiotlb allocates 256KB at a time.
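(For reference, a 256KB chunk corresponds to one IO_TLB_SEGSIZE group of
slabs: with the existing defaults of IO_TLB_SEGSIZE = 128 and
IO_TLB_SHIFT = 11, that is 128 slabs * 2KB = 256KB per allocation.)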
>
> Signed-off-by: FUJITA Tomonori <fujita.tomonori@....ntt.co.jp>
> ---
> lib/swiotlb.c | 186 ++++++++++++++++++++++++++++++++++++++++++---------------
> 1 files changed, 138 insertions(+), 48 deletions(-)
>
> diff --git a/lib/swiotlb.c b/lib/swiotlb.c
> index a009055..e2c64ab 100644
> --- a/lib/swiotlb.c
> +++ b/lib/swiotlb.c
> @@ -65,11 +65,14 @@ int swiotlb_force;
> * sync_single_*, to see if the memory was in fact allocated by this
> * API.
> */
> -static char *io_tlb_start, *io_tlb_end;
> +static char **__io_tlb_start;
> +
> +static int alloc_io_tlb_chunks;
I was hoping you would base this on the
git://git.kernel.org/pub/scm/linux/kernel/git/konrad/swiotlb-2.6.git stable/swiotlb-0.8.3
branch, as that is what I am going to ask Linus to pull for 2.6.36.
>
> /*
> - * The number of IO TLB blocks (in groups of 64) betweeen io_tlb_start and
> - * io_tlb_end. This is command line adjustable via setup_io_tlb_npages.
> + * The number of IO TLB blocks (in groups of 64) betweeen
> + * io_tlb_start. This is command line adjustable via
> + * setup_io_tlb_npages.
> */
> static unsigned long io_tlb_nslabs;
>
> @@ -130,11 +133,11 @@ void swiotlb_print_info(void)
> unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT;
> phys_addr_t pstart, pend;
>
> - pstart = virt_to_phys(io_tlb_start);
> - pend = virt_to_phys(io_tlb_end);
> + pstart = virt_to_phys(__io_tlb_start[0]);
> + pend = virt_to_phys(__io_tlb_start[0] + (IO_TLB_SEGSIZE << IO_TLB_SHIFT));
>
> - printk(KERN_INFO "Placing %luMB software IO TLB between %p - %p\n",
> - bytes >> 20, io_tlb_start, io_tlb_end);
> + printk(KERN_INFO "software IO TLB can be enlarged to %lu MB\n",
> + bytes >> 20);
> printk(KERN_INFO "software IO TLB at phys %#llx - %#llx\n",
> (unsigned long long)pstart,
> (unsigned long long)pend);
> @@ -154,20 +157,24 @@ swiotlb_init_with_default_size(size_t default_size, int verbose)
> io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
> }
>
> - bytes = io_tlb_nslabs << IO_TLB_SHIFT;
> + bytes = IO_TLB_SEGSIZE << IO_TLB_SHIFT;
> +
> + __io_tlb_start = alloc_bootmem(
> + (io_tlb_nslabs / IO_TLB_SEGSIZE) * sizeof(char *));
> + memset(__io_tlb_start, 0, (io_tlb_nslabs / IO_TLB_SEGSIZE) * sizeof(char *));
> + alloc_io_tlb_chunks = 1;
>
> /*
> * Get IO TLB memory from the low pages
> */
> - io_tlb_start = alloc_bootmem_low_pages(bytes);
> - if (!io_tlb_start)
> + __io_tlb_start[0] = alloc_bootmem_low_pages(bytes);
> + if (!__io_tlb_start[0])
> panic("Cannot allocate SWIOTLB buffer");
> - io_tlb_end = io_tlb_start + bytes;
>
> /*
> * Allocate and initialize the free list array. This array is used
> * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
> - * between io_tlb_start and io_tlb_end.
> + * between io_tlb_start.
> */
> io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int));
> for (i = 0; i < io_tlb_nslabs; i++)
> @@ -215,14 +222,14 @@ swiotlb_late_init_with_default_size(size_t default_size)
> bytes = io_tlb_nslabs << IO_TLB_SHIFT;
>
> while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
> - io_tlb_start = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
> - order);
> - if (io_tlb_start)
> + __io_tlb_start[0] = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
> + order);
> + if (__io_tlb_start[0])
> break;
> order--;
> }
>
> - if (!io_tlb_start)
> + if (!__io_tlb_start[0])
> goto cleanup1;
>
> if (order != get_order(bytes)) {
> @@ -231,13 +238,12 @@ swiotlb_late_init_with_default_size(size_t default_size)
> io_tlb_nslabs = SLABS_PER_PAGE << order;
> bytes = io_tlb_nslabs << IO_TLB_SHIFT;
> }
> - io_tlb_end = io_tlb_start + bytes;
> - memset(io_tlb_start, 0, bytes);
> + memset(__io_tlb_start[0], 0, bytes);
>
> /*
> * Allocate and initialize the free list array. This array is used
> * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
> - * between io_tlb_start and io_tlb_end.
> + * between io_tlb_start.
> */
> io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL,
> get_order(io_tlb_nslabs * sizeof(int)));
> @@ -280,9 +286,8 @@ cleanup3:
> sizeof(int)));
> io_tlb_list = NULL;
> cleanup2:
> - io_tlb_end = NULL;
> - free_pages((unsigned long)io_tlb_start, order);
> - io_tlb_start = NULL;
> + free_pages((unsigned long)__io_tlb_start[0], order);
> + __io_tlb_start[0] = NULL;
> cleanup1:
> io_tlb_nslabs = req_nslabs;
> return -ENOMEM;
> @@ -300,7 +305,7 @@ void __init swiotlb_free(void)
> get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
> free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
> sizeof(int)));
> - free_pages((unsigned long)io_tlb_start,
> + free_pages((unsigned long)__io_tlb_start[0],
> get_order(io_tlb_nslabs << IO_TLB_SHIFT));
> } else {
> free_bootmem_late(__pa(io_tlb_overflow_buffer),
> @@ -309,15 +314,36 @@ void __init swiotlb_free(void)
> io_tlb_nslabs * sizeof(phys_addr_t));
> free_bootmem_late(__pa(io_tlb_list),
> io_tlb_nslabs * sizeof(int));
> - free_bootmem_late(__pa(io_tlb_start),
> + free_bootmem_late(__pa(__io_tlb_start[0]),
> io_tlb_nslabs << IO_TLB_SHIFT);
> }
> }
>
> static int is_swiotlb_buffer(phys_addr_t paddr)
> {
> - return paddr >= virt_to_phys(io_tlb_start) &&
> - paddr < virt_to_phys(io_tlb_end);
> + unsigned long flags;
> + int i, ret = 0;
> + char *vstart;
> + phys_addr_t pstart, pend;
> +
> + spin_lock_irqsave(&io_tlb_lock, flags);
This spinlock: would it be better to replace it with a reader/writer
spinlock? I am asking because this routine, 'is_swiotlb_buffer', ends up
being called during unmap/sync. The unmap path is probably not a big
deal if it takes a bit of time, but the sync path is. Given the number
of DMA pages that e1000e/e100/igb allocate, would it make sense to speed
this search up by using that type of lock?
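For illustration only, a minimal sketch of what the lookup could look
like if io_tlb_lock were converted to an rwlock_t (the lock name here is
made up; this is not part of the patch):

/*
 * Hypothetical reader/writer variant: unmap/sync lookups take the read
 * side and can run concurrently; only expand_io_tlb() would need the
 * write side.
 */
static DEFINE_RWLOCK(io_tlb_rwlock);

static int is_swiotlb_buffer(phys_addr_t paddr)
{
	unsigned long flags;
	int i, ret = 0;

	read_lock_irqsave(&io_tlb_rwlock, flags);
	for (i = 0; i < alloc_io_tlb_chunks; i++) {
		char *vstart = __io_tlb_start[i];
		phys_addr_t pstart, pend;

		if (!vstart)
			break;

		pstart = virt_to_phys(vstart);
		pend = virt_to_phys(vstart + (IO_TLB_SEGSIZE << IO_TLB_SHIFT));
		if (paddr >= pstart && paddr < pend) {
			ret = 1;
			break;
		}
	}
	read_unlock_irqrestore(&io_tlb_rwlock, flags);

	return ret;
}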
> + for (i = 0; i < alloc_io_tlb_chunks; i++) {
> + vstart = __io_tlb_start[i];
> +
> + if (!vstart)
> + break;
> +
> + pstart = virt_to_phys(vstart);
> + pend = virt_to_phys(vstart + (IO_TLB_SEGSIZE << IO_TLB_SHIFT));
> +
> + if (paddr >= pstart && paddr < pend) {
> + ret = 1;
> + break;
> + }
> + }
> + spin_unlock_irqrestore(&io_tlb_lock, flags);
> +
> + return ret;
> }
>
> /*
> @@ -361,6 +387,35 @@ static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
> }
> }
>
> +static int expand_io_tlb(void)
> +{
> + int order;
> + char *v;
> +
> + /* we can't expand anymore. */
> + if (alloc_io_tlb_chunks == io_tlb_nslabs / IO_TLB_SEGSIZE) {
> + printk("%s %d: can't expand swiotlb %d, %lu\n",
> + __func__, __LINE__,
> + alloc_io_tlb_chunks, io_tlb_nslabs);
> + return 1;
> + }
> +
> + order = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT);
> +
> + printk("%s %d: tlb is expanded, %d\n", __func__, __LINE__,
> + alloc_io_tlb_chunks);
> +
> + v = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, order);
> + if (!v) {
> + printk("%s %d: swiotlb oom\n", __func__, __LINE__);
> + return 1;
> + }
> +
> + __io_tlb_start[alloc_io_tlb_chunks++] = v;
> +
> + return 0;
> +}
> +
> /*
> * Allocates bounce buffer and returns its kernel virtual address.
> */
> @@ -375,9 +430,13 @@ map_single(struct device *hwdev, phys_addr_t phys, size_t size, int dir)
> unsigned long mask;
> unsigned long offset_slots;
> unsigned long max_slots;
> + int tlb_chunk_index = 0;
> +
> +again:
> + BUG_ON(tlb_chunk_index >= alloc_io_tlb_chunks);
>
> mask = dma_get_seg_boundary(hwdev);
> - start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start) & mask;
> + start_dma_addr = swiotlb_virt_to_bus(hwdev, __io_tlb_start[tlb_chunk_index]) & mask;
<tangent>
A while back we spoke about extending the SWIOTLB so that other SWIOTLB
users could do their own virt->phys translation. With this patch I can
see that still working, if:
- we exported __io_tlb_start, so that the other library can expand it
  if required;
- ditto for the spinlock, io_tlb_lock;
- and also alloc_io_tlb_chunks;
- and we had some way of running the SWIOTLB library after the
  expand_io_tlb call, so that it can make the new chunk physically
  contiguous.
Perhaps it might then be time to revisit a registration mechanism?
That might also solve the problem hpa has with the Xen-SWIOTLB mucking
around in the pci-dma.c file.
The rough idea is to have a structure with the following routines (see
the sketch after this tangent):
- int (*detect)(void);
- void (*init)(void);
- int (*is_swiotlb)(dma_addr_t dev_addr, struct swiotlb_data *);
- int (*expand)(struct swiotlb_data *);
The 'detect' would be used in 'pci_swiotlb_detect' as:
int __init pci_swiotlb_detect(void) {
	return iotlb->detect();
}
and 'init' similarly in 'pci_swiotlb_init'.
The 'is_swiotlb' and 'expand' hooks would do what they need to do.
That is, 'is_swiotlb' would determine whether the bus address sits
within the IOTLB chunks, and 'expand' would do what 'expand_io_tlb'
does, but using whatever mechanism is necessary to make sure the new
chunk is physically contiguous on the architecture it is running on.
'struct swiotlb_data' would contain all of the data needed to make
those decisions: the spinlock, alloc_io_tlb_chunks, the io_tlb_start
array, and io_tlb_nslabs.
</tangent>
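To make that tangent concrete, here is a rough sketch of what such a
registration structure could look like. None of these types or names
exist today; they are illustrative only:

/* Illustrative only -- all names are made up. */
struct swiotlb_data {
	spinlock_t	lock;			/* protects the fields below   */
	int		alloc_io_tlb_chunks;	/* chunks allocated so far     */
	char		**io_tlb_start;		/* per-chunk virtual addresses */
	unsigned long	io_tlb_nslabs;		/* total slabs across chunks   */
};

struct swiotlb_ops {
	int	(*detect)(void);
	void	(*init)(void);
	int	(*is_swiotlb)(dma_addr_t dev_addr, struct swiotlb_data *d);
	int	(*expand)(struct swiotlb_data *d);
};

/* Set once by whichever implementation registers itself. */
static struct swiotlb_ops *iotlb;

int __init pci_swiotlb_detect(void)
{
	return iotlb->detect();
}

void __init pci_swiotlb_init(void)
{
	iotlb->init();
}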
>
> offset_slots = ALIGN(start_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
>
> @@ -405,16 +464,17 @@ map_single(struct device *hwdev, phys_addr_t phys, size_t size, int dir)
> * request and allocate a buffer from that IO TLB pool.
> */
> spin_lock_irqsave(&io_tlb_lock, flags);
> - index = ALIGN(io_tlb_index, stride);
> - if (index >= io_tlb_nslabs)
> - index = 0;
> + index = 0;
> wrap = index;
>
> do {
> + unsigned int *tlb_list = io_tlb_list +
> + tlb_chunk_index * IO_TLB_SEGSIZE;
> +
> while (iommu_is_span_boundary(index, nslots, offset_slots,
> max_slots)) {
> index += stride;
> - if (index >= io_tlb_nslabs)
> + if (index >= IO_TLB_SEGSIZE)
> index = 0;
> if (index == wrap)
> goto not_found;
> @@ -425,30 +485,31 @@ map_single(struct device *hwdev, phys_addr_t phys, size_t size, int dir)
> * contiguous buffers, we allocate the buffers from that slot
> * and mark the entries as '0' indicating unavailable.
> */
> - if (io_tlb_list[index] >= nslots) {
> + if (tlb_list[index] >= nslots) {
> int count = 0;
>
> for (i = index; i < (int) (index + nslots); i++)
> - io_tlb_list[i] = 0;
> + tlb_list[i] = 0;
> for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--)
> - io_tlb_list[i] = ++count;
> - dma_addr = io_tlb_start + (index << IO_TLB_SHIFT);
> -
> - /*
> - * Update the indices to avoid searching in the next
> - * round.
> - */
> - io_tlb_index = ((index + nslots) < io_tlb_nslabs
> - ? (index + nslots) : 0);
> -
> + tlb_list[i] = ++count;
> + dma_addr = __io_tlb_start[tlb_chunk_index] + (index << IO_TLB_SHIFT);
> goto found;
> }
> index += stride;
> - if (index >= io_tlb_nslabs)
> + if (index >= IO_TLB_SEGSIZE)
> index = 0;
> } while (index != wrap);
>
> not_found:
> + if (tlb_chunk_index < io_tlb_nslabs / IO_TLB_SEGSIZE) {
> + tlb_chunk_index++;
> + if (tlb_chunk_index < alloc_io_tlb_chunks ||
> + !expand_io_tlb()) {
> + spin_unlock_irqrestore(&io_tlb_lock, flags);
> + goto again;
> + }
> + }
> +
> spin_unlock_irqrestore(&io_tlb_lock, flags);
> return NULL;
> found:
> @@ -460,13 +521,41 @@ found:
> * needed.
> */
> for (i = 0; i < nslots; i++)
> - io_tlb_orig_addr[index+i] = phys + (i << IO_TLB_SHIFT);
> + io_tlb_orig_addr[tlb_chunk_index * IO_TLB_SEGSIZE + index + i] = phys + (i << IO_TLB_SHIFT);
> if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
> swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE);
>
> return dma_addr;
> }
>
> +static int get_index(char *vaddr)
> +{
> + int i, index, ret = 0;
> + unsigned long flags;
> + char *vstart;
> +
> + spin_lock_irqsave(&io_tlb_lock, flags);
> + for (i = 0; i < alloc_io_tlb_chunks; i++) {
> + vstart = __io_tlb_start[i];
> +
> + if (!vstart)
> + break;
> +
> + if (vaddr >= vstart && vaddr < vstart +
> + (IO_TLB_SEGSIZE << IO_TLB_SHIFT)) {
> + ret = 1;
> + break;
> + }
> + }
> + spin_unlock_irqrestore(&io_tlb_lock, flags);
> +
> + BUG_ON(!ret);
> +
> + index = (vaddr - __io_tlb_start[i]) >> IO_TLB_SHIFT;
> +
> + return (i * IO_TLB_SEGSIZE) + index;
> +}
> +
> /*
> * dma_addr is the kernel virtual address of the bounce buffer to unmap.
> */
> @@ -475,7 +564,7 @@ do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
> {
> unsigned long flags;
> int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
> - int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
> + int index = get_index(dma_addr);
> phys_addr_t phys = io_tlb_orig_addr[index];
>
> /*
> @@ -514,7 +603,7 @@ static void
> sync_single(struct device *hwdev, char *dma_addr, size_t size,
> int dir, int target)
> {
> - int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
> + int index = get_index(dma_addr);
> phys_addr_t phys = io_tlb_orig_addr[index];
>
> phys += ((unsigned long)dma_addr & ((1 << IO_TLB_SHIFT) - 1));
> @@ -893,6 +982,7 @@ EXPORT_SYMBOL(swiotlb_dma_mapping_error);
> int
> swiotlb_dma_supported(struct device *hwdev, u64 mask)
> {
> - return swiotlb_virt_to_bus(hwdev, io_tlb_end - 1) <= mask;
> + char *vend = __io_tlb_start[0] + (io_tlb_nslabs << IO_TLB_SHIFT);
> + return swiotlb_virt_to_bus(hwdev, vend - 1) <= mask;
> }
> EXPORT_SYMBOL(swiotlb_dma_supported);
> --
> 1.6.5
>