Message-ID: <878ttdslfb.fsf@eliezer.anholt.net>
Date:   Mon, 24 Oct 2016 10:31:52 -0700
From:   Eric Anholt <eric@...olt.net>
To:     mzoran@...wfest.net, gregkh@...uxfoundation.org
Cc:     mzoran@...wfest.net, daniels@...labora.com, noralf@...nnes.org,
        popcornmix@...il.com, devel@...verdev.osuosl.org,
        linux-kernel@...r.kernel.org
Subject: Re: [PATCH 2/2] staging: vc04_services: Replace dmac_map_area with dmac_map_sg

mzoran@...wfest.net writes:

> From: Michael Zoran <mzoran@...wfest.net>
>
> The original ARM implementation uses dmac_map_area, which is not
> portable.  Replace it with an architecture-neutral version that uses
> dma_map_sg.
>
> As the benchmarks below show, for larger message sizes the dma_map_sg
> implementation is faster than the original non-portable dmac_map_area
> implementation.
>
> Test                       dmac_map_area   dma_map_page dma_map_sg
> vchiq_test -b 4 10000      51us/iter       76us/iter    76us
> vchiq_test -b 8 10000      70us/iter       82us/iter    91us
> vchiq_test -b 16 10000     94us/iter       118us/iter   121us
> vchiq_test -b 32 10000     146us/iter      173us/iter   187us
> vchiq_test -b 64 10000     263us/iter      328us/iter   299us
> vchiq_test -b 128 10000    529us/iter      631us/iter   595us
> vchiq_test -b 256 10000    2285us/iter     2275us/iter  2001us
> vchiq_test -b 512 10000    4372us/iter     4616us/iter  4123us
>
> For message sizes >= 64KB, dma_map_sg is faster than dma_map_page.
>
> For message sizes >= 256KB, dma_map_sg is the fastest
> implementation.
>
> Signed-off-by: Michael Zoran <mzoran@...wfest.net>
> ---
>  .../interface/vchiq_arm/vchiq_2835_arm.c           | 153 ++++++++++++---------
>  1 file changed, 91 insertions(+), 62 deletions(-)
>
> diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c
> index 98c6819..5ca66ee 100644
> --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c
> +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c
> @@ -45,13 +45,8 @@
>  #include <asm/pgtable.h>
>  #include <soc/bcm2835/raspberrypi-firmware.h>
>  
> -#define dmac_map_area			__glue(_CACHE,_dma_map_area)
> -#define dmac_unmap_area 		__glue(_CACHE,_dma_unmap_area)
> -
>  #define TOTAL_SLOTS (VCHIQ_SLOT_ZERO_SLOTS + 2 * 32)
>  
> -#define VCHIQ_ARM_ADDRESS(x) ((void *)((char *)x + g_virt_to_bus_offset))
> -
>  #include "vchiq_arm.h"
>  #include "vchiq_2835.h"
>  #include "vchiq_connected.h"
> @@ -73,7 +68,7 @@ static unsigned int g_fragments_size;
>  static char *g_fragments_base;
>  static char *g_free_fragments;
>  static struct semaphore g_free_fragments_sema;
> -static unsigned long g_virt_to_bus_offset;
> +static struct device *g_dev;
>  
>  extern int vchiq_arm_log_level;
>  
> @@ -84,10 +79,11 @@ vchiq_doorbell_irq(int irq, void *dev_id);
>  
>  static int
>  create_pagelist(char __user *buf, size_t count, unsigned short type,
> -                struct task_struct *task, PAGELIST_T ** ppagelist);
> +		struct task_struct *task, PAGELIST_T **ppagelist,
> +		dma_addr_t *dma_addr);
>  
>  static void
> -free_pagelist(PAGELIST_T *pagelist, int actual);
> +free_pagelist(dma_addr_t dma_addr, PAGELIST_T *pagelist, int actual);
>  
>  int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state)
>  {
> @@ -101,8 +97,6 @@ int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state)
>  	int slot_mem_size, frag_mem_size;
>  	int err, irq, i;
>  
> -	g_virt_to_bus_offset = virt_to_dma(dev, (void *)0);
> -
>  	(void)of_property_read_u32(dev->of_node, "cache-line-size",
>  				   &g_cache_line_size);
>  	g_fragments_size = 2 * g_cache_line_size;
> @@ -118,7 +112,7 @@ int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state)
>  		return -ENOMEM;
>  	}
>  
> -	WARN_ON(((int)slot_mem & (PAGE_SIZE - 1)) != 0);
> +	WARN_ON(((unsigned long)slot_mem & (PAGE_SIZE - 1)) != 0);
>  
>  	vchiq_slot_zero = vchiq_init_slots(slot_mem, slot_mem_size);
>  	if (!vchiq_slot_zero)
> @@ -170,6 +164,7 @@ int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state)
>  		return err ? : -ENXIO;
>  	}
>  
> +	g_dev = dev;
>  	vchiq_log_info(vchiq_arm_log_level,
>  		"vchiq_init - done (slots %pK, phys %pad)",
>  		vchiq_slot_zero, &slot_phys);
> @@ -233,6 +228,7 @@ vchiq_prepare_bulk_data(VCHIQ_BULK_T *bulk, VCHI_MEM_HANDLE_T memhandle,
>  {
>  	PAGELIST_T *pagelist;
>  	int ret;
> +	dma_addr_t dma_addr;
>  
>  	WARN_ON(memhandle != VCHI_MEM_HANDLE_INVALID);
>  
> @@ -241,12 +237,14 @@ vchiq_prepare_bulk_data(VCHIQ_BULK_T *bulk, VCHI_MEM_HANDLE_T memhandle,
>  			? PAGELIST_READ
>  			: PAGELIST_WRITE,
>  			current,
> -			&pagelist);
> +			&pagelist,
> +			&dma_addr);
> +
>  	if (ret != 0)
>  		return VCHIQ_ERROR;
>  
>  	bulk->handle = memhandle;
> -	bulk->data = VCHIQ_ARM_ADDRESS(pagelist);
> +	bulk->data = (void *)(unsigned long)dma_addr;
>  
>  	/* Store the pagelist address in remote_data, which isn't used by the
>  	   slave. */
> @@ -259,7 +257,8 @@ void
>  vchiq_complete_bulk(VCHIQ_BULK_T *bulk)
>  {
>  	if (bulk && bulk->remote_data && bulk->actual)
> -		free_pagelist((PAGELIST_T *)bulk->remote_data, bulk->actual);
> +		free_pagelist((dma_addr_t)(unsigned long)bulk->data,
> +			      (PAGELIST_T *)bulk->remote_data, bulk->actual);
>  }
>  
>  void
> @@ -353,38 +352,44 @@ vchiq_doorbell_irq(int irq, void *dev_id)
>  ** obscuring the new data underneath. We can solve this by transferring the
>  ** partial cache lines separately, and allowing the ARM to copy into the
>  ** cached area.
> -
> -** N.B. This implementation plays slightly fast and loose with the Linux
> -** driver programming rules, e.g. its use of dmac_map_area instead of
> -** dma_map_single, but it isn't a multi-platform driver and it benefits
> -** from increased speed as a result.
>  */
>  
>  static int
>  create_pagelist(char __user *buf, size_t count, unsigned short type,
> -	struct task_struct *task, PAGELIST_T ** ppagelist)
> +		struct task_struct *task, PAGELIST_T **ppagelist,
> +		dma_addr_t *dma_addr)
>  {
>  	PAGELIST_T *pagelist;
>  	struct page **pages;
> -	unsigned long *addrs;
> -	unsigned int num_pages, offset, i;
> -	char *addr, *base_addr, *next_addr;
> -	int run, addridx, actual_pages;
> +	u32 *addrs;
> +	unsigned int num_pages, offset, i, j, k;
> +	int actual_pages;
>          unsigned long *need_release;
> +	size_t pagelist_size;
> +	struct scatterlist *scatterlist;
> +	int dma_buffers;
> +	int dir;
>  
> -	offset = (unsigned int)buf & (PAGE_SIZE - 1);
> +	offset = ((unsigned int)(unsigned long)buf & (PAGE_SIZE - 1));
>  	num_pages = (count + offset + PAGE_SIZE - 1) / PAGE_SIZE;
>  
> +	pagelist_size = sizeof(PAGELIST_T) +
> +			(num_pages * sizeof(unsigned long)) +
> +			sizeof(unsigned long) +
> +			(num_pages * sizeof(pages[0]) +
> +			(num_pages * sizeof(struct scatterlist)));
> +
>  	*ppagelist = NULL;
>  
> +	dir = (type == PAGELIST_WRITE) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
> +
>  	/* Allocate enough storage to hold the page pointers and the page
>  	** list
>  	*/
> -	pagelist = kmalloc(sizeof(PAGELIST_T) +
> -                           (num_pages * sizeof(unsigned long)) +
> -                           sizeof(unsigned long) +
> -                           (num_pages * sizeof(pages[0])),
> -                           GFP_KERNEL);
> +	pagelist = dma_zalloc_coherent(g_dev,
> +				       pagelist_size,
> +				       dma_addr,
> +				       GFP_KERNEL);
>  
>  	vchiq_log_trace(vchiq_arm_log_level, "create_pagelist - %pK",
>  			pagelist);
> @@ -394,10 +399,9 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
>  	addrs = pagelist->addrs;
>          need_release = (unsigned long *)(addrs + num_pages);
>  	pages = (struct page **)(addrs + num_pages + 1);
> +	scatterlist = (struct scatterlist *)(pages + num_pages);
>  
>  	if (is_vmalloc_addr(buf)) {
> -		int dir = (type == PAGELIST_WRITE) ?
> -			DMA_TO_DEVICE : DMA_FROM_DEVICE;
>  		unsigned long length = count;
>  		unsigned int off = offset;
>  
> @@ -410,7 +414,6 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
>  			if (bytes > length)
>  				bytes = length;
>  			pages[actual_pages] = pg;
> -			dmac_map_area(page_address(pg) + off, bytes, dir);
>  			length -= bytes;
>  			off = 0;
>  		}
> @@ -438,7 +441,8 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
>  				actual_pages--;
>  				put_page(pages[actual_pages]);
>  			}
> -			kfree(pagelist);
> +			dma_free_coherent(g_dev, pagelist_size,
> +					  pagelist, *dma_addr);
>  			if (actual_pages == 0)
>  				actual_pages = -ENOMEM;
>  			return actual_pages;
> @@ -450,29 +454,38 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
>  	pagelist->type = type;
>  	pagelist->offset = offset;
>  
> -	/* Group the pages into runs of contiguous pages */
> -
> -	base_addr = VCHIQ_ARM_ADDRESS(page_address(pages[0]));
> -	next_addr = base_addr + PAGE_SIZE;
> -	addridx = 0;
> -	run = 0;
> -
> -	for (i = 1; i < num_pages; i++) {
> -		addr = VCHIQ_ARM_ADDRESS(page_address(pages[i]));
> -		if ((addr == next_addr) && (run < (PAGE_SIZE - 1))) {
> -			next_addr += PAGE_SIZE;
> -			run++;
> -		} else {
> -			addrs[addridx] = (unsigned long)base_addr + run;
> -			addridx++;
> -			base_addr = addr;
> -			next_addr = addr + PAGE_SIZE;
> -			run = 0;
> -		}
> +	for (i = 0; i < num_pages; i++)
> +		sg_set_page(scatterlist + i, pages[i], PAGE_SIZE, 0);
> +
> +	dma_buffers = dma_map_sg(g_dev,
> +				 scatterlist,
> +				 num_pages,
> +				 dir);
> +
> +	if (dma_buffers == 0) {
> +		dma_free_coherent(g_dev, pagelist_size,
> +				  pagelist, *dma_addr);
> +
> +		return -EINTR;
>  	}
>  
> -	addrs[addridx] = (unsigned long)base_addr + run;
> -	addridx++;
> +	k = 0;
> +	for (i = 0; i < dma_buffers;) {
> +		u32 section_length = 0;
> +
> +		for (j = i + 1; j < dma_buffers; j++) {
> +			if (sg_dma_address(scatterlist + j) !=
> +			    sg_dma_address(scatterlist + j - 1) + PAGE_SIZE) {
> +				break;
> +			}
> +			section_length++;
> +		}
> +
> +		addrs[k] = (u32)sg_dma_address(scatterlist + i) |
> +				section_length;

It looks like scatterlist may not be just an array, so I think this
whole loop wants to be something like:

for_each_sg(scatterlist, sg, num_pages, i) {
	u32 len = sg_dma_len(sg);
	u32 addr = sg_dma_address(sg);

	/* Extend the previous run if this segment is contiguous with it. */
	if (k > 0 &&
	    ((addrs[k - 1] & PAGE_MASK) +
	     ((addrs[k - 1] & ~PAGE_MASK) << PAGE_SHIFT)) == addr) {
		addrs[k - 1] += len;
	} else {
		addrs[k++] = addr | len;
	}
}

Note the use of sg_dma_len(), since sg entries may cover more than one
page.  I don't think any merging is actually happening on our platform
currently, which is why I left out the inner loop.
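
As a generic illustration of that point (this is not from the patch;
example_map_and_walk(), "dev", "sgl" and "num_pages" are made-up
placeholder names), the usual DMA-API pattern is to walk the segment
count returned by dma_map_sg(), since the DMA layer is allowed to merge
adjacent entries, and then to unmap with the original entry count:

#include <linux/kernel.h>
#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>

/* Sketch only: walk the DMA segments produced by dma_map_sg().  The
 * device and scatterlist stand in for the driver's own g_dev and the
 * scatterlist built in create_pagelist().
 */
static int example_map_and_walk(struct device *dev, struct scatterlist *sgl,
				unsigned int num_pages,
				enum dma_data_direction dir)
{
	struct scatterlist *sg;
	int nents, i;

	nents = dma_map_sg(dev, sgl, num_pages, dir);
	if (nents == 0)
		return -ENOMEM;

	/* Adjacent pages may have been merged, so there can be fewer
	 * segments than pages and a segment can be longer than PAGE_SIZE;
	 * hence sg_dma_len() rather than assuming PAGE_SIZE.
	 */
	for_each_sg(sgl, sg, nents, i) {
		dma_addr_t addr = sg_dma_address(sg);
		unsigned int len = sg_dma_len(sg);

		pr_debug("segment %d: %pad + %u\n", i, &addr, len);
	}

	/* Unmap with the same entry count that was originally mapped. */
	dma_unmap_sg(dev, sgl, num_pages, dir);
	return 0;
}

On this platform no merging may happen in practice, as noted above, but
coding against sg_dma_len() keeps the loop correct if it ever does.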

Thanks for taking on this conversion!  This code's going to be so
much nicer when you're done.

