[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <f17065d6-5083-74c9-d9ca-a467b640aed3@arm.com>
Date: Mon, 12 Jul 2021 10:55:32 +0100
From: Suzuki K Poulose <suzuki.poulose@....com>
To: Leo Yan <leo.yan@...aro.org>,
Mathieu Poirier <mathieu.poirier@...aro.org>,
Mike Leach <mike.leach@...aro.org>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
coresight@...ts.linaro.org, linux-arm-kernel@...ts.infradead.org,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2] coresight: tmc-etr: Speed up for bounce buffer in flat
mode
Leo,
On 10/07/2021 08:01, Leo Yan wrote:
> The AUX bounce buffer is allocated with API dma_alloc_coherent(); in the
> low-level architecture code, e.g. for Arm64, it maps the memory with
> the attribute "Normal non-cacheable"; this can be concluded from the
> definition for pgprot_dmacoherent() in arch/arm64/include/asm/pgtable.h.
>
> Later, when accessing the AUX bounce buffer, since the memory mapping is
> non-cacheable, it's inefficient because every load instruction must
> reach out to DRAM.
>
> This patch changes to allocate pages with alloc_pages_node(), thus the
> driver can access the memory with cacheable mapping in the kernel linear
> virtual address; therefore, because load instructions can fetch data
> from cache lines rather than always read data from DRAM, the driver can
> boost memory copying performance. After using the cacheable mapping, the
> driver uses dma_sync_single_for_cpu() to invalidate the cache lines prior
> to reading the bounce buffer, so it avoids reading stale trace data.
>
> Measuring the duration of function tmc_update_etr_buffer() with the
> ftrace function_graph tracer shows a significant performance
> improvement for copying 4MiB of data from the bounce buffer:
>
> # echo tmc_etr_get_data_flat_buf > set_graph_notrace // avoid noise
> # echo tmc_update_etr_buffer > set_graph_function
> # echo function_graph > current_tracer
>
> before:
>
> # CPU DURATION FUNCTION CALLS
> # | | | | | | |
> 2) | tmc_update_etr_buffer() {
> ...
> 2) # 8148.320 us | }
>
> after:
>
> # CPU DURATION FUNCTION CALLS
> # | | | | | | |
> 2) | tmc_update_etr_buffer() {
> ...
> 2) # 2463.980 us | }
That's a good speedup! Thanks for the patch. One minor comment
below.
>
> Signed-off-by: Leo Yan <leo.yan@...aro.org>
> ---
>
> Changes from v1:
> Set "flat_buf->daddr" to 0 when it fails to map the DMA region; and dropped
> the unexpected if condition change in tmc_etr_free_flat_buf().
>
> .../hwtracing/coresight/coresight-tmc-etr.c | 56 ++++++++++++++++---
> 1 file changed, 49 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c
> index acdb59e0e661..888b0f929d33 100644
> --- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
> +++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
> @@ -21,6 +21,7 @@
>
> struct etr_flat_buf {
> struct device *dev;
> + struct page *pages;
> dma_addr_t daddr;
> void *vaddr;
> size_t size;
> @@ -600,6 +601,7 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
> {
> struct etr_flat_buf *flat_buf;
> struct device *real_dev = drvdata->csdev->dev.parent;
> + ssize_t aligned_size;
>
> /* We cannot reuse existing pages for flat buf */
> if (pages)
> @@ -609,11 +611,18 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
> if (!flat_buf)
> return -ENOMEM;
>
> - flat_buf->vaddr = dma_alloc_coherent(real_dev, etr_buf->size,
> - &flat_buf->daddr, GFP_KERNEL);
> - if (!flat_buf->vaddr) {
> - kfree(flat_buf);
> - return -ENOMEM;
> + aligned_size = PAGE_ALIGN(etr_buf->size);
> + flat_buf->pages = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO,
> + get_order(aligned_size));
> + if (!flat_buf->pages)
> + goto fail_alloc_pages;
> +
> + flat_buf->vaddr = page_address(flat_buf->pages);
> + flat_buf->daddr = dma_map_page(real_dev, flat_buf->pages, 0,
> + aligned_size, DMA_FROM_DEVICE);
> + if (dma_mapping_error(real_dev, flat_buf->daddr)) {
> + flat_buf->daddr = 0;
> + goto fail_dma_map_page;
> }
>
> flat_buf->size = etr_buf->size;
> @@ -622,6 +631,12 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
> etr_buf->mode = ETR_MODE_FLAT;
> etr_buf->private = flat_buf;
> return 0;
> +
> +fail_dma_map_page:
> + __free_pages(flat_buf->pages, get_order(aligned_size));
> +fail_alloc_pages:
> + kfree(flat_buf);
> + return -ENOMEM;
> }
>
> static void tmc_etr_free_flat_buf(struct etr_buf *etr_buf)
> @@ -630,15 +645,20 @@ static void tmc_etr_free_flat_buf(struct etr_buf *etr_buf)
>
> if (flat_buf && flat_buf->daddr) {
> struct device *real_dev = flat_buf->dev->parent;
> + ssize_t aligned_size = PAGE_ALIGN(etr_buf->size);
>
> - dma_free_coherent(real_dev, flat_buf->size,
> - flat_buf->vaddr, flat_buf->daddr);
> + dma_unmap_page(real_dev, flat_buf->daddr, aligned_size,
> + DMA_FROM_DEVICE);
> + __free_pages(flat_buf->pages, get_order(aligned_size));
> }
> kfree(flat_buf);
> }
>
> static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp)
> {
> + struct etr_flat_buf *flat_buf = etr_buf->private;
> + struct device *real_dev = flat_buf->dev->parent;
> +
> /*
> * Adjust the buffer to point to the beginning of the trace data
> * and update the available trace data.
> @@ -648,6 +668,28 @@ static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp)
> etr_buf->len = etr_buf->size;
> else
> etr_buf->len = rwp - rrp;
> +
> + if (etr_buf->offset + etr_buf->len > etr_buf->size) {
> + int len1, len2;
> +
> + /*
> + * If trace data is wrapped around, sync AUX bounce buffer
> + * for two chunks: "len1" is for the trace data length at
> + * the tail of bounce buffer, and "len2" is the length from
> + * the start of the buffer after wrapping around.
> + */
> + len1 = etr_buf->size - etr_buf->offset;
> + len2 = etr_buf->len - len1;
> + dma_sync_single_for_cpu(real_dev,
> + flat_buf->daddr + etr_buf->offset,
> + len1, DMA_FROM_DEVICE);
> + dma_sync_single_for_cpu(real_dev, flat_buf->daddr,
> + len2, DMA_FROM_DEVICE);
We always start tracing at the beginning of the buffer, and the only
reason we would get a wrap-around is when the buffer is full.
So you could just as well sync the entire buffer in one go:
dma_sync_single_for_cpu(real_dev, flat_buf->daddr,
etr_buf->len, DMA_FROM_DEVICE);
> + } else {
> + dma_sync_single_for_cpu(real_dev,
> + flat_buf->daddr + etr_buf->offset,
> + etr_buf->len, DMA_FROM_DEVICE);
> + }
> }
>
> static ssize_t tmc_etr_get_data_flat_buf(struct etr_buf *etr_buf,
>
Either way,
Reviewed-by: Suzuki K Poulose <suzuki.poulose@....com>
Powered by blists - more mailing lists