Message-ID: <43a9e692d8498b0e4469e4225d428e84@codeaurora.org>
Date: Wed, 09 Aug 2017 19:48:36 +0530
From: Abhishek Sahu <absahu@...eaurora.org>
To: Sricharan R <sricharan@...eaurora.org>
Cc: vinod.koul@...el.com, andy.gross@...aro.org,
david.brown@...aro.org, dan.j.williams@...el.com,
linux-arm-msm@...r.kernel.org, linux-soc@...r.kernel.org,
dmaengine@...r.kernel.org, linux-kernel@...r.kernel.org,
dmaengine-owner@...r.kernel.org
Subject: Re: [PATCH v2] dmaengine: qcom-bam: Process multiple pending
descriptors
On 2017-08-03 18:51, Sricharan R wrote:
> The bam dmaengine has a circular FIFO to which we
> add hw descriptors that describe the transaction.
> The FIFO has space for about 4096 hw descriptors.
>
> Currently we add one descriptor, wait for it to
> complete with an interrupt, and then add the next pending
> descriptor. In this way the FIFO is underutilized,
> since only one descriptor is processed at a time, although
> there is space in the FIFO for the BAM to process more.
>
> Instead, keep adding descriptors to the FIFO until it is full,
> which allows the BAM to continue working on the next descriptor
> immediately after signalling the completion interrupt for the
> previous descriptor.
>
> Also, when the client has not set DMA_PREP_INTERRUPT for
> a descriptor, do not configure the BAM to trigger an interrupt
> upon completion of that descriptor. This way we get an interrupt
> only for the descriptor for which DMA_PREP_INTERRUPT was
> requested, and there we signal completion of all the previously
> completed descriptors. So we still do callbacks for all requested
> descriptors, just that the number of interrupts is reduced.
>
> CURRENT:
>
>             --------    --------    ---------------
>             |DESC 0|    |DESC 1|    |DESC 2 + INT |
>             --------    --------    ---------------
>                |           |              |
>                |           |              |
> INTERRUPT:   (INT)       (INT)         (INT)
> CALLBACK:    (CB)        (CB)          (CB)
>
> MTD_SPEEDTEST READ PAGE: 3560 KiB/s
> MTD_SPEEDTEST WRITE PAGE: 2664 KiB/s
> IOZONE READ: 2456 KB/s
> IOZONE WRITE: 1230 KB/s
>
> bam dma interrupts (after tests): 96508
>
> CHANGE:
>
>             --------    --------    ---------------
>             |DESC 0|    |DESC 1|    |DESC 2 + INT |
>             --------    --------    ---------------
>                                            |
>                                            |
>                                          (INT)
>                                    (CB for 0, 1, 2)
>
> MTD_SPEEDTEST READ PAGE: 3860 KiB/s
> MTD_SPEEDTEST WRITE PAGE: 2837 KiB/s
> IOZONE READ: 2677 KB/s
> IOZONE WRITE: 1308 KB/s
>
> bam dma interrupts (after tests): 58806
>
> Signed-off-by: Sricharan R <sricharan@...eaurora.org>
Thanks Sricharan for your patch to do the descriptor
clubbing in the BAM DMA driver.

I verified this patch with my NAND QPIC patches:
https://www.spinics.net/lists/kernel/msg2573736.html

I ran the MTD tests overnight and no failures were
observed. I also achieved a significant improvement in
NAND speed. Following are the numbers for the IPQ4019
DK04 board.
Test                      Speed in KiB/s
                          Before    After
eraseblock write speed    4716      5483
eraseblock read speed     6855      8294
page write speed          4678      5436
page read speed           6784      8217
Tested-by: Abhishek Sahu <absahu@...eaurora.org>
Also, I reviewed the patch and the following are
minor comments.
> ---
> drivers/dma/qcom/bam_dma.c | 202 +++++++++++++++++++++++++++++----------------
> 1 file changed, 129 insertions(+), 73 deletions(-)
>
> diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c
> index 6d89fb6..9e92d7c 100644
> --- a/drivers/dma/qcom/bam_dma.c
> +++ b/drivers/dma/qcom/bam_dma.c
> @@ -46,6 +46,7 @@
> #include <linux/of_address.h>
> #include <linux/of_irq.h>
> #include <linux/of_dma.h>
> +#include <linux/circ_buf.h>
> #include <linux/clk.h>
> #include <linux/dmaengine.h>
> #include <linux/pm_runtime.h>
> @@ -78,6 +79,8 @@ struct bam_async_desc {
>
> struct bam_desc_hw *curr_desc;
>
> + /* list node for the desc in the bam_chan list of descriptors */
> + struct list_head desc_node;
> enum dma_transfer_direction dir;
> size_t length;
> struct bam_desc_hw desc[0];
> @@ -356,7 +359,7 @@ struct bam_chan {
> /* configuration from device tree */
> u32 id;
>
> - struct bam_async_desc *curr_txd; /* current running dma */
> + bool is_busy; /* Set to true if FIFO is full */
>
> /* runtime configuration */
> struct dma_slave_config slave;
> @@ -372,6 +375,8 @@ struct bam_chan {
> unsigned int initialized; /* is the channel hw initialized? */
> unsigned int paused; /* is the channel paused? */
> unsigned int reconfigure; /* new slave config? */
> + /* list of descriptors currently processed */
> + struct list_head desc_list;
>
> struct list_head node;
> };
> @@ -539,7 +544,7 @@ static void bam_free_chan(struct dma_chan *chan)
>
> vchan_free_chan_resources(to_virt_chan(chan));
>
> - if (bchan->curr_txd) {
> + if (!list_empty(&bchan->desc_list)) {
> dev_err(bchan->bdev->dev, "Cannot free busy channel\n");
> goto err;
> }
> @@ -632,8 +637,6 @@ static struct dma_async_tx_descriptor *bam_prep_slave_sg(struct dma_chan *chan,
>
> if (flags & DMA_PREP_INTERRUPT)
> async_desc->flags |= DESC_FLAG_EOT;
> - else
> - async_desc->flags |= DESC_FLAG_INT;
>
> async_desc->num_desc = num_alloc;
> async_desc->curr_desc = async_desc->desc;
> @@ -684,15 +687,14 @@ static struct dma_async_tx_descriptor *bam_prep_slave_sg(struct dma_chan *chan,
> static int bam_dma_terminate_all(struct dma_chan *chan)
> {
> struct bam_chan *bchan = to_bam_chan(chan);
> + struct bam_async_desc *async_desc;
> unsigned long flag;
> LIST_HEAD(head);
>
> /* remove all transactions, including active transaction */
> spin_lock_irqsave(&bchan->vc.lock, flag);
> - if (bchan->curr_txd) {
> - list_add(&bchan->curr_txd->vd.node, &bchan->vc.desc_issued);
> - bchan->curr_txd = NULL;
> - }
> + list_for_each_entry(async_desc, &bchan->desc_list, desc_node)
> + list_add(&async_desc->vd.node, &bchan->vc.desc_issued);
Should we also remove these descriptors from desc_list here, since we
are adding them back to desc_issued and vchan_dma_desc_free_list()
will free all of these descriptors?
Otherwise, when the IRQ is triggered it will traverse this list and
fetch async descriptors whose memory has already been freed.
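Something like this might work (completely untested sketch; a 'tmp'
iterator variable would need to be declared along with async_desc),
keeping desc_list and desc_issued consistent:

	list_for_each_entry_safe(async_desc, tmp, &bchan->desc_list,
				 desc_node) {
		/* hand the descriptor back to the vchan so it gets freed */
		list_add(&async_desc->vd.node, &bchan->vc.desc_issued);
		/* drop it from desc_list so the IRQ path never sees it */
		list_del(&async_desc->desc_node);
	}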
>
> vchan_get_all_descriptors(&bchan->vc, &head);
> spin_unlock_irqrestore(&bchan->vc.lock, flag);
> @@ -763,9 +765,9 @@ static int bam_resume(struct dma_chan *chan)
> */
> static u32 process_channel_irqs(struct bam_device *bdev)
> {
> - u32 i, srcs, pipe_stts;
> + u32 i, srcs, pipe_stts, offset, avail;
> unsigned long flags;
> - struct bam_async_desc *async_desc;
> + struct bam_async_desc *async_desc, *tmp;
>
> srcs = readl_relaxed(bam_addr(bdev, 0, BAM_IRQ_SRCS_EE));
>
> @@ -781,31 +783,51 @@ static u32 process_channel_irqs(struct bam_device *bdev)
>
> /* clear pipe irq */
> pipe_stts = readl_relaxed(bam_addr(bdev, i,
> BAM_P_IRQ_STTS));
> -
> writel_relaxed(pipe_stts, bam_addr(bdev, i,
> BAM_P_IRQ_CLR));
>
> spin_lock_irqsave(&bchan->vc.lock, flags);
> - async_desc = bchan->curr_txd;
> -
> - if (async_desc) {
> - async_desc->num_desc -= async_desc->xfer_len;
> - async_desc->curr_desc += async_desc->xfer_len;
> - bchan->curr_txd = NULL;
> -
> - /* manage FIFO */
> - bchan->head += async_desc->xfer_len;
> - bchan->head %= MAX_DESCRIPTORS;
> -
> - /*
> - * if complete, process cookie. Otherwise
> - * push back to front of desc_issued so that
> - * it gets restarted by the tasklet
> - */
> - if (!async_desc->num_desc)
> - vchan_cookie_complete(&async_desc->vd);
> - else
> - list_add(&async_desc->vd.node,
> - &bchan->vc.desc_issued);
> +
> + offset = readl_relaxed(bam_addr(bdev, i, BAM_P_SW_OFSTS)) &
> + P_SW_OFSTS_MASK;
> + offset /= sizeof(struct bam_desc_hw);
> +
> + /* Number of bytes available to read */
> + avail = CIRC_CNT(offset, bchan->head, MAX_DESCRIPTORS + 1);
> +
> + list_for_each_entry_safe(async_desc, tmp,
> + &bchan->desc_list, desc_node) {
> + if (async_desc) {
Do we need this check? async_desc can never be NULL here, since
list_for_each_entry_safe() only hands back valid list entries.
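With the check dropped, the loop body would simply lose one level of
indentation, roughly like this (untested, body abbreviated):

	list_for_each_entry_safe(async_desc, tmp,
				 &bchan->desc_list, desc_node) {
		/* Not enough data to read */
		if (avail < async_desc->xfer_len)
			break;

		/* ... rest of the existing body, unchanged ... */
	}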
> + /* Not enough data to read */
> + if (avail < async_desc->xfer_len)
> + break;
> +
> + /* manage FIFO */
> + bchan->head += async_desc->xfer_len;
> + bchan->head %= MAX_DESCRIPTORS;
> +
> + async_desc->num_desc -= async_desc->xfer_len;
> + async_desc->curr_desc += async_desc->xfer_len;
> + avail -= async_desc->xfer_len;
> +
> + /*
> + * if complete, process cookie. Otherwise
> + * push back to front of desc_issued so that
> + * it gets restarted by the tasklet
> + */
> + if (!async_desc->num_desc) {
> + vchan_cookie_complete(&async_desc->vd);
> + } else {
> + list_add(&async_desc->vd.node,
> + &bchan->vc.desc_issued);
> + }
> + list_del(&async_desc->desc_node);
> +
> + /*
> + * one desc has completed, so there is space
> + * in FIFO
> + */
> + bchan->is_busy = false;
> + }
> }
>
> spin_unlock_irqrestore(&bchan->vc.lock, flags);
> @@ -867,6 +889,7 @@ static enum dma_status bam_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
> struct dma_tx_state *txstate)
> {
> struct bam_chan *bchan = to_bam_chan(chan);
> + struct bam_async_desc *async_desc;
> struct virt_dma_desc *vd;
> int ret;
> size_t residue = 0;
> @@ -882,11 +905,17 @@ static enum dma_status bam_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
>
> spin_lock_irqsave(&bchan->vc.lock, flags);
> vd = vchan_find_desc(&bchan->vc, cookie);
> - if (vd)
> + if (vd) {
> residue = container_of(vd, struct bam_async_desc, vd)->length;
> - else if (bchan->curr_txd && bchan->curr_txd->vd.tx.cookie == cookie)
> - for (i = 0; i < bchan->curr_txd->num_desc; i++)
> - residue += bchan->curr_txd->curr_desc[i].size;
> + } else {
> + list_for_each_entry(async_desc, &bchan->desc_list, desc_node) {
> + if (async_desc->vd.tx.cookie != cookie)
> + continue;
> +
> + for (i = 0; i < async_desc->num_desc; i++)
> + residue += async_desc->curr_desc[i].size;
> + }
> + }
>
> spin_unlock_irqrestore(&bchan->vc.lock, flags);
>
> @@ -927,63 +956,89 @@ static void bam_start_dma(struct bam_chan *bchan)
> {
> struct virt_dma_desc *vd = vchan_next_desc(&bchan->vc);
> struct bam_device *bdev = bchan->bdev;
> - struct bam_async_desc *async_desc;
> + struct bam_async_desc *async_desc = NULL;
> struct bam_desc_hw *desc;
> struct bam_desc_hw *fifo = PTR_ALIGN(bchan->fifo_virt,
> sizeof(struct bam_desc_hw));
> int ret;
> + unsigned int avail;
> + struct dmaengine_desc_callback cb;
>
> lockdep_assert_held(&bchan->vc.lock);
>
> if (!vd)
> return;
>
> - list_del(&vd->node);
> -
> - async_desc = container_of(vd, struct bam_async_desc, vd);
> - bchan->curr_txd = async_desc;
> -
> ret = pm_runtime_get_sync(bdev->dev);
> if (ret < 0)
> return;
>
> - /* on first use, initialize the channel hardware */
> - if (!bchan->initialized)
> - bam_chan_init_hw(bchan, async_desc->dir);
> + while (vd && !bchan->is_busy) {
> + list_del(&vd->node);
>
> - /* apply new slave config changes, if necessary */
> - if (bchan->reconfigure)
> - bam_apply_new_config(bchan, async_desc->dir);
> + async_desc = container_of(vd, struct bam_async_desc, vd);
>
> - desc = bchan->curr_txd->curr_desc;
> + /* on first use, initialize the channel hardware */
> + if (!bchan->initialized)
> + bam_chan_init_hw(bchan, async_desc->dir);
>
> - if (async_desc->num_desc > MAX_DESCRIPTORS)
> - async_desc->xfer_len = MAX_DESCRIPTORS;
> - else
> - async_desc->xfer_len = async_desc->num_desc;
> + /* apply new slave config changes, if necessary */
> + if (bchan->reconfigure)
> + bam_apply_new_config(bchan, async_desc->dir);
>
> - /* set any special flags on the last descriptor */
> - if (async_desc->num_desc == async_desc->xfer_len)
> - desc[async_desc->xfer_len - 1].flags |=
> - cpu_to_le16(async_desc->flags);
> - else
> - desc[async_desc->xfer_len - 1].flags |=
> - cpu_to_le16(DESC_FLAG_INT);
> + desc = async_desc->curr_desc;
> + avail = CIRC_SPACE(bchan->tail, bchan->head,
> + MAX_DESCRIPTORS + 1);
> +
> + if (async_desc->num_desc > avail)
> + async_desc->xfer_len = avail;
> + else
> + async_desc->xfer_len = async_desc->num_desc;
> +
> + /* set any special flags on the last descriptor */
> + if (async_desc->num_desc == async_desc->xfer_len)
> + desc[async_desc->xfer_len - 1].flags |=
> + cpu_to_le16(async_desc->flags);
> +
> + vd = vchan_next_desc(&bchan->vc);
> +
> + /* FIFO is FULL */
> + bchan->is_busy = (avail <= async_desc->xfer_len) ? true : false;
>
> - if (bchan->tail + async_desc->xfer_len > MAX_DESCRIPTORS) {
> - u32 partial = MAX_DESCRIPTORS - bchan->tail;
> + dmaengine_desc_get_callback(&async_desc->vd.tx, &cb);
>
> - memcpy(&fifo[bchan->tail], desc,
> - partial * sizeof(struct bam_desc_hw));
> - memcpy(fifo, &desc[partial], (async_desc->xfer_len - partial) *
> + /*
> + * An interrupt is generated at this desc, if
> + * - FIFO is FULL.
> + * - No more descriptors to add.
> + * - If a callback completion was requested for this DESC,
> + * In this case, BAM will deliver the completion callback
> + * for this desc and continue processing the next desc.
> + */
> + if ((bchan->is_busy || !vd ||
> + dmaengine_desc_callback_valid(&cb)) &&
> + !(async_desc->flags & DESC_FLAG_EOT))
> + desc[async_desc->xfer_len - 1].flags |=
> + cpu_to_le16(DESC_FLAG_INT);
> +
> + if (bchan->tail + async_desc->xfer_len > MAX_DESCRIPTORS) {
> + u32 partial = MAX_DESCRIPTORS - bchan->tail;
> +
> + memcpy(&fifo[bchan->tail], desc,
> + partial * sizeof(struct bam_desc_hw));
> + memcpy(fifo, &desc[partial],
> + (async_desc->xfer_len - partial) *
> sizeof(struct bam_desc_hw));
> - } else {
> - memcpy(&fifo[bchan->tail], desc,
> - async_desc->xfer_len * sizeof(struct bam_desc_hw));
> - }
> + } else {
> + memcpy(&fifo[bchan->tail], desc,
> + async_desc->xfer_len *
> + sizeof(struct bam_desc_hw));
> + }
>
> - bchan->tail += async_desc->xfer_len;
> - bchan->tail %= MAX_DESCRIPTORS;
> + bchan->tail += async_desc->xfer_len;
> + bchan->tail %= MAX_DESCRIPTORS;
> + list_add_tail(&async_desc->desc_node, &bchan->desc_list);
> + }
>
> /* ensure descriptor writes and dma start not reordered */
> wmb();
> @@ -1012,7 +1067,7 @@ static void dma_tasklet(unsigned long data)
> bchan = &bdev->channels[i];
> spin_lock_irqsave(&bchan->vc.lock, flags);
>
> - if (!list_empty(&bchan->vc.desc_issued) && !bchan->curr_txd)
> + if (!list_empty(&bchan->vc.desc_issued) && !bchan->is_busy)
> bam_start_dma(bchan);
> spin_unlock_irqrestore(&bchan->vc.lock, flags);
> }
> @@ -1033,7 +1088,7 @@ static void bam_issue_pending(struct dma_chan *chan)
> spin_lock_irqsave(&bchan->vc.lock, flags);
>
> /* if work pending and idle, start a transaction */
> - if (vchan_issue_pending(&bchan->vc) && !bchan->curr_txd)
> + if (vchan_issue_pending(&bchan->vc) && !bchan->is_busy)
> bam_start_dma(bchan);
Can we get rid of bchan->is_busy altogether? It is only used to track
whether there is space left in the actual hardware FIFO, and the same
can be checked with CIRC_SPACE(bchan->tail, bchan->head, MAX_DESCRIPTORS + 1).
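For example (untested sketch; bam_fifo_full() is only an illustrative
helper name, and whether "no space left" matches exactly how is_busy is
derived from xfer_len would need a closer look):

/* caller is expected to hold bchan->vc.lock */
static bool bam_fifo_full(struct bam_chan *bchan)
{
	return !CIRC_SPACE(bchan->tail, bchan->head,
			   MAX_DESCRIPTORS + 1);
}

and the checks in bam_issue_pending()/dma_tasklet() would then become:

	if (vchan_issue_pending(&bchan->vc) && !bam_fifo_full(bchan))
		bam_start_dma(bchan);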
Thanks,
Abhishek
>
> spin_unlock_irqrestore(&bchan->vc.lock, flags);
> @@ -1133,6 +1188,7 @@ static void bam_channel_init(struct bam_device *bdev, struct bam_chan *bchan,
>
> vchan_init(&bchan->vc, &bdev->common);
> bchan->vc.desc_free = bam_dma_free_desc;
> + INIT_LIST_HEAD(&bchan->desc_list);
> }
>
> static const struct of_device_id bam_of_match[] = {