Message-ID: <43a9e692d8498b0e4469e4225d428e84@codeaurora.org>
Date:   Wed, 09 Aug 2017 19:48:36 +0530
From:   Abhishek Sahu <absahu@...eaurora.org>
To:     Sricharan R <sricharan@...eaurora.org>
Cc:     vinod.koul@...el.com, andy.gross@...aro.org,
        david.brown@...aro.org, dan.j.williams@...el.com,
        linux-arm-msm@...r.kernel.org, linux-soc@...r.kernel.org,
        dmaengine@...r.kernel.org, linux-kernel@...r.kernel.org,
        dmaengine-owner@...r.kernel.org
Subject: Re: [PATCH v2] dmaengine: qcom-bam: Process multiple pending
 descriptors

On 2017-08-03 18:51, Sricharan R wrote:
> The bam dmaengine has a circular FIFO to which we
> add hw descriptors that describe the transaction.
> The FIFO has space for about 4096 hw descriptors.
> 
> Currently we add one descriptor, wait for it to
> complete with an interrupt, and then add the next pending
> descriptor. In this way, the FIFO is underutilized
> since only one descriptor is processed at a time, although
> there is space in the FIFO for the BAM to process more.
> 
> Instead, keep adding descriptors to the FIFO till it is full,
> which allows the BAM to continue working on the next descriptor
> immediately after signalling the completion interrupt for the
> previous descriptor.
> 
> Also, when the client has not set DMA_PREP_INTERRUPT for
> a descriptor, do not configure the BAM to trigger an interrupt
> upon completion of that descriptor. This way we get an interrupt
> only for the descriptor for which DMA_PREP_INTERRUPT was
> requested, and there we signal completion of all the previously
> completed descriptors. So we still do callbacks for all requested
> descriptors, but the number of interrupts is reduced.
> 
> CURRENT:
> 
>             --------    --------    --------------
>             |DESC 0|    |DESC 1|    |DESC 2 + INT|
>             --------    --------    --------------
>                |           |             |
>                |           |             |
> INTERRUPT:   (INT)       (INT)         (INT)
> CALLBACK:     (CB)        (CB)          (CB)
> 
> 		MTD_SPEEDTEST READ PAGE: 3560 KiB/s
> 		MTD_SPEEDTEST WRITE PAGE: 2664 KiB/s
> 		IOZONE READ: 2456 KB/s
> 		IOZONE WRITE: 1230 KB/s
> 
> 	bam dma interrupts (after tests): 96508
> 
> CHANGE:
> 
>         --------  --------  --------------
>         |DESC 0|  |DESC 1|  |DESC 2 + INT|
>         --------  --------  --------------
>                                   |
>                                   |
>                                 (INT)
>                                 (CB for 0, 1, 2)
> 
> 		MTD_SPEEDTEST READ PAGE: 3860 KiB/s
> 		MTD_SPEEDTEST WRITE PAGE: 2837 KiB/s
> 		IOZONE READ: 2677 KB/s
> 		IOZONE WRITE: 1308 KB/s
> 
> 	bam dma interrupts (after tests): 58806
> 
> Signed-off-by: Sricharan R <sricharan@...eaurora.org>

  Thanks, Sricharan, for your patch to do the descriptor
  clubbing in the BAM DMA driver.

  I verified this patch with my QPIC NAND patches:
  https://www.spinics.net/lists/kernel/msg2573736.html

  I ran the MTD tests overnight and no failures were
  observed. A significant improvement in NAND speed was
  also achieved. Following are the numbers for the
  IPQ4019 DK04 board.

  Test                      Speed in KiB/s
                            Before    After

  eraseblock write speed     4716      5483
  eraseblock read speed      6855      8294
  page write speed           4678      5436
  page read speed            6784      8217

Tested-by: Abhishek Sahu <absahu@...eaurora.org>

  Also, I reviewed the patch and the following are
  a few minor comments.

> ---
>  drivers/dma/qcom/bam_dma.c | 202 +++++++++++++++++++++++++++++----------------
>  1 file changed, 129 insertions(+), 73 deletions(-)
> 
> diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c
> index 6d89fb6..9e92d7c 100644
> --- a/drivers/dma/qcom/bam_dma.c
> +++ b/drivers/dma/qcom/bam_dma.c
> @@ -46,6 +46,7 @@
>  #include <linux/of_address.h>
>  #include <linux/of_irq.h>
>  #include <linux/of_dma.h>
> +#include <linux/circ_buf.h>
>  #include <linux/clk.h>
>  #include <linux/dmaengine.h>
>  #include <linux/pm_runtime.h>
> @@ -78,6 +79,8 @@ struct bam_async_desc {
> 
>  	struct bam_desc_hw *curr_desc;
> 
> +	/* list node for the desc in the bam_chan list of descriptors */
> +	struct list_head desc_node;
>  	enum dma_transfer_direction dir;
>  	size_t length;
>  	struct bam_desc_hw desc[0];
> @@ -356,7 +359,7 @@ struct bam_chan {
>  	/* configuration from device tree */
>  	u32 id;
> 
> -	struct bam_async_desc *curr_txd;	/* current running dma */
> +	bool is_busy; /* Set to true if FIFO is full */
> 
>  	/* runtime configuration */
>  	struct dma_slave_config slave;
> @@ -372,6 +375,8 @@ struct bam_chan {
>  	unsigned int initialized;	/* is the channel hw initialized? */
>  	unsigned int paused;		/* is the channel paused? */
>  	unsigned int reconfigure;	/* new slave config? */
> +	/* list of descriptors currently processed */
> +	struct list_head desc_list;
> 
>  	struct list_head node;
>  };
> @@ -539,7 +544,7 @@ static void bam_free_chan(struct dma_chan *chan)
> 
>  	vchan_free_chan_resources(to_virt_chan(chan));
> 
> -	if (bchan->curr_txd) {
> +	if (!list_empty(&bchan->desc_list)) {
>  		dev_err(bchan->bdev->dev, "Cannot free busy channel\n");
>  		goto err;
>  	}
> @@ -632,8 +637,6 @@ static struct dma_async_tx_descriptor *bam_prep_slave_sg(struct dma_chan *chan,
> 
>  	if (flags & DMA_PREP_INTERRUPT)
>  		async_desc->flags |= DESC_FLAG_EOT;
> -	else
> -		async_desc->flags |= DESC_FLAG_INT;
> 
>  	async_desc->num_desc = num_alloc;
>  	async_desc->curr_desc = async_desc->desc;
> @@ -684,15 +687,14 @@ static struct dma_async_tx_descriptor *bam_prep_slave_sg(struct dma_chan *chan,
>  static int bam_dma_terminate_all(struct dma_chan *chan)
>  {
>  	struct bam_chan *bchan = to_bam_chan(chan);
> +	struct bam_async_desc *async_desc;
>  	unsigned long flag;
>  	LIST_HEAD(head);
> 
>  	/* remove all transactions, including active transaction */
>  	spin_lock_irqsave(&bchan->vc.lock, flag);
> -	if (bchan->curr_txd) {
> -		list_add(&bchan->curr_txd->vd.node, &bchan->vc.desc_issued);
> -		bchan->curr_txd = NULL;
> -	}
> +	list_for_each_entry(async_desc, &bchan->desc_list, desc_node)
> +		list_add(&async_desc->vd.node, &bchan->vc.desc_issued);

   Should we also remove these descriptors from desc_list, since we
   are adding them back to desc_issued and vchan_dma_desc_free_list()
   will free all of these descriptors?

   When the IRQ is triggered, it will then traverse this list and
   fetch async descriptors whose memory has already been freed.
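
   Something like the below (untested, just to illustrate; it assumes a
   second iterator variable, e.g. "struct bam_async_desc *tmp") would
   keep desc_list consistent:

	list_for_each_entry_safe(async_desc, tmp,
				 &bchan->desc_list, desc_node) {
		/* move it back so vchan_get_all_descriptors() picks it up */
		list_add(&async_desc->vd.node, &bchan->vc.desc_issued);

		/* drop it from desc_list so the IRQ path never touches
		 * a descriptor that has already been freed
		 */
		list_del(&async_desc->desc_node);
	}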

> 
>  	vchan_get_all_descriptors(&bchan->vc, &head);
>  	spin_unlock_irqrestore(&bchan->vc.lock, flag);
> @@ -763,9 +765,9 @@ static int bam_resume(struct dma_chan *chan)
>   */
>  static u32 process_channel_irqs(struct bam_device *bdev)
>  {
> -	u32 i, srcs, pipe_stts;
> +	u32 i, srcs, pipe_stts, offset, avail;
>  	unsigned long flags;
> -	struct bam_async_desc *async_desc;
> +	struct bam_async_desc *async_desc, *tmp;
> 
>  	srcs = readl_relaxed(bam_addr(bdev, 0, BAM_IRQ_SRCS_EE));
> 
> @@ -781,31 +783,51 @@ static u32 process_channel_irqs(struct bam_device *bdev)
> 
>  		/* clear pipe irq */
>  		pipe_stts = readl_relaxed(bam_addr(bdev, i,
>  			BAM_P_IRQ_STTS));
> -
>  		writel_relaxed(pipe_stts, bam_addr(bdev, i,
>  			BAM_P_IRQ_CLR));
> 
>  		spin_lock_irqsave(&bchan->vc.lock, flags);
> -		async_desc = bchan->curr_txd;
> -
> -		if (async_desc) {
> -			async_desc->num_desc -= async_desc->xfer_len;
> -			async_desc->curr_desc += async_desc->xfer_len;
> -			bchan->curr_txd = NULL;
> -
> -			/* manage FIFO */
> -			bchan->head += async_desc->xfer_len;
> -			bchan->head %= MAX_DESCRIPTORS;
> -
> -			/*
> -			 * if complete, process cookie.  Otherwise
> -			 * push back to front of desc_issued so that
> -			 * it gets restarted by the tasklet
> -			 */
> -			if (!async_desc->num_desc)
> -				vchan_cookie_complete(&async_desc->vd);
> -			else
> -				list_add(&async_desc->vd.node,
> -					&bchan->vc.desc_issued);
> +
> +		offset = readl_relaxed(bam_addr(bdev, i, BAM_P_SW_OFSTS)) &
> +				       P_SW_OFSTS_MASK;
> +		offset /= sizeof(struct bam_desc_hw);
> +
> +		/* Number of bytes available to read */
> +		avail = CIRC_CNT(offset, bchan->head, MAX_DESCRIPTORS + 1);
> +
> +		list_for_each_entry_safe(async_desc, tmp,
> +					 &bchan->desc_list, desc_node) {
> +			if (async_desc) {

  Do we need this check, since async_desc will never be NULL here?
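
  i.e. (untested) the loop body could then start directly with the
  length check, dropping one level of indentation:

	list_for_each_entry_safe(async_desc, tmp,
				 &bchan->desc_list, desc_node) {
		/* Not enough data to read */
		if (avail < async_desc->xfer_len)
			break;
		...
	}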

> +				/* Not enough data to read */
> +				if (avail < async_desc->xfer_len)
> +					break;
> +
> +				/* manage FIFO */
> +				bchan->head += async_desc->xfer_len;
> +				bchan->head %= MAX_DESCRIPTORS;
> +
> +				async_desc->num_desc -= async_desc->xfer_len;
> +				async_desc->curr_desc += async_desc->xfer_len;
> +				avail -= async_desc->xfer_len;
> +
> +				/*
> +				 * if complete, process cookie. Otherwise
> +				 * push back to front of desc_issued so that
> +				 * it gets restarted by the tasklet
> +				 */
> +				if (!async_desc->num_desc) {
> +					vchan_cookie_complete(&async_desc->vd);
> +				} else {
> +					list_add(&async_desc->vd.node,
> +						 &bchan->vc.desc_issued);
> +				}
> +				list_del(&async_desc->desc_node);
> +
> +				/*
> +				 * one desc has completed, so there is space
> +				 * in FIFO
> +				 */
> +				bchan->is_busy = false;
> +			}
>  		}
> 
>  		spin_unlock_irqrestore(&bchan->vc.lock, flags);
> @@ -867,6 +889,7 @@ static enum dma_status bam_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
>  		struct dma_tx_state *txstate)
>  {
>  	struct bam_chan *bchan = to_bam_chan(chan);
> +	struct bam_async_desc *async_desc;
>  	struct virt_dma_desc *vd;
>  	int ret;
>  	size_t residue = 0;
> @@ -882,11 +905,17 @@ static enum dma_status bam_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
> 
>  	spin_lock_irqsave(&bchan->vc.lock, flags);
>  	vd = vchan_find_desc(&bchan->vc, cookie);
> -	if (vd)
> +	if (vd) {
>  		residue = container_of(vd, struct bam_async_desc, vd)->length;
> -	else if (bchan->curr_txd && bchan->curr_txd->vd.tx.cookie == cookie)
> -		for (i = 0; i < bchan->curr_txd->num_desc; i++)
> -			residue += bchan->curr_txd->curr_desc[i].size;
> +	} else {
> +		list_for_each_entry(async_desc, &bchan->desc_list, desc_node) {
> +			if (async_desc->vd.tx.cookie != cookie)
> +				continue;
> +
> +			for (i = 0; i < async_desc->num_desc; i++)
> +				residue += async_desc->curr_desc[i].size;
> +		}
> +	}
> 
>  	spin_unlock_irqrestore(&bchan->vc.lock, flags);
> 
> @@ -927,63 +956,89 @@ static void bam_start_dma(struct bam_chan *bchan)
>  {
>  	struct virt_dma_desc *vd = vchan_next_desc(&bchan->vc);
>  	struct bam_device *bdev = bchan->bdev;
> -	struct bam_async_desc *async_desc;
> +	struct bam_async_desc *async_desc = NULL;
>  	struct bam_desc_hw *desc;
>  	struct bam_desc_hw *fifo = PTR_ALIGN(bchan->fifo_virt,
>  					sizeof(struct bam_desc_hw));
>  	int ret;
> +	unsigned int avail;
> +	struct dmaengine_desc_callback cb;
> 
>  	lockdep_assert_held(&bchan->vc.lock);
> 
>  	if (!vd)
>  		return;
> 
> -	list_del(&vd->node);
> -
> -	async_desc = container_of(vd, struct bam_async_desc, vd);
> -	bchan->curr_txd = async_desc;
> -
>  	ret = pm_runtime_get_sync(bdev->dev);
>  	if (ret < 0)
>  		return;
> 
> -	/* on first use, initialize the channel hardware */
> -	if (!bchan->initialized)
> -		bam_chan_init_hw(bchan, async_desc->dir);
> +	while (vd && !bchan->is_busy) {
> +		list_del(&vd->node);
> 
> -	/* apply new slave config changes, if necessary */
> -	if (bchan->reconfigure)
> -		bam_apply_new_config(bchan, async_desc->dir);
> +		async_desc = container_of(vd, struct bam_async_desc, vd);
> 
> -	desc = bchan->curr_txd->curr_desc;
> +		/* on first use, initialize the channel hardware */
> +		if (!bchan->initialized)
> +			bam_chan_init_hw(bchan, async_desc->dir);
> 
> -	if (async_desc->num_desc > MAX_DESCRIPTORS)
> -		async_desc->xfer_len = MAX_DESCRIPTORS;
> -	else
> -		async_desc->xfer_len = async_desc->num_desc;
> +		/* apply new slave config changes, if necessary */
> +		if (bchan->reconfigure)
> +			bam_apply_new_config(bchan, async_desc->dir);
> 
> -	/* set any special flags on the last descriptor */
> -	if (async_desc->num_desc == async_desc->xfer_len)
> -		desc[async_desc->xfer_len - 1].flags |=
> -					cpu_to_le16(async_desc->flags);
> -	else
> -		desc[async_desc->xfer_len - 1].flags |=
> -					cpu_to_le16(DESC_FLAG_INT);
> +		desc = async_desc->curr_desc;
> +		avail = CIRC_SPACE(bchan->tail, bchan->head,
> +				   MAX_DESCRIPTORS + 1);
> +
> +		if (async_desc->num_desc > avail)
> +			async_desc->xfer_len = avail;
> +		else
> +			async_desc->xfer_len = async_desc->num_desc;
> +
> +		/* set any special flags on the last descriptor */
> +		if (async_desc->num_desc == async_desc->xfer_len)
> +			desc[async_desc->xfer_len - 1].flags |=
> +					cpu_to_le16(async_desc->flags);
> +
> +		vd = vchan_next_desc(&bchan->vc);
> +
> +		/* FIFO is FULL */
> +		bchan->is_busy = (avail <= async_desc->xfer_len) ? true : false;
> 
> -	if (bchan->tail + async_desc->xfer_len > MAX_DESCRIPTORS) {
> -		u32 partial = MAX_DESCRIPTORS - bchan->tail;
> +		dmaengine_desc_get_callback(&async_desc->vd.tx, &cb);
> 
> -		memcpy(&fifo[bchan->tail], desc,
> -				partial * sizeof(struct bam_desc_hw));
> -		memcpy(fifo, &desc[partial], (async_desc->xfer_len - partial) *
> +		/*
> +		 * An interrupt is generated at this desc, if
> +		 *  - FIFO is FULL.
> +		 *  - No more descriptors to add.
> +		 *  - If a callback completion was requested for this DESC,
> +		 *     In this case, BAM will deliver the completion callback
> +		 *     for this desc and continue processing the next desc.
> +		 */
> +		if ((bchan->is_busy || !vd ||
> +		     dmaengine_desc_callback_valid(&cb)) &&
> +		    !(async_desc->flags & DESC_FLAG_EOT))
> +			desc[async_desc->xfer_len - 1].flags |=
> +				cpu_to_le16(DESC_FLAG_INT);
> +
> +		if (bchan->tail + async_desc->xfer_len > MAX_DESCRIPTORS) {
> +			u32 partial = MAX_DESCRIPTORS - bchan->tail;
> +
> +			memcpy(&fifo[bchan->tail], desc,
> +			       partial * sizeof(struct bam_desc_hw));
> +			memcpy(fifo, &desc[partial],
> +			       (async_desc->xfer_len - partial) *
>  				sizeof(struct bam_desc_hw));
> -	} else {
> -		memcpy(&fifo[bchan->tail], desc,
> -			async_desc->xfer_len * sizeof(struct bam_desc_hw));
> -	}
> +		} else {
> +			memcpy(&fifo[bchan->tail], desc,
> +			       async_desc->xfer_len *
> +			       sizeof(struct bam_desc_hw));
> +		}
> 
> -	bchan->tail += async_desc->xfer_len;
> -	bchan->tail %= MAX_DESCRIPTORS;
> +		bchan->tail += async_desc->xfer_len;
> +		bchan->tail %= MAX_DESCRIPTORS;
> +		list_add_tail(&async_desc->desc_node, &bchan->desc_list);
> +	}
> 
>  	/* ensure descriptor writes and dma start not reordered */
>  	wmb();
> @@ -1012,7 +1067,7 @@ static void dma_tasklet(unsigned long data)
>  		bchan = &bdev->channels[i];
>  		spin_lock_irqsave(&bchan->vc.lock, flags);
> 
> -		if (!list_empty(&bchan->vc.desc_issued) && !bchan->curr_txd)
> +		if (!list_empty(&bchan->vc.desc_issued) && !bchan->is_busy)
>  			bam_start_dma(bchan);
>  		spin_unlock_irqrestore(&bchan->vc.lock, flags);
>  	}
> @@ -1033,7 +1088,7 @@ static void bam_issue_pending(struct dma_chan *chan)
>  	spin_lock_irqsave(&bchan->vc.lock, flags);
> 
>  	/* if work pending and idle, start a transaction */
> -	if (vchan_issue_pending(&bchan->vc) && !bchan->curr_txd)
> +	if (vchan_issue_pending(&bchan->vc) && !bchan->is_busy)
>  		bam_start_dma(bchan);

  Can we get rid of bchan->is_busy? It is only used to track whether
  there is space in the actual hardware FIFO, and the same can be
  checked with CIRC_SPACE(bchan->tail, bchan->head, MAX_DESCRIPTORS + 1).
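
  For example (untested), the check in dma_tasklet() could then
  simply become:

	if (!list_empty(&bchan->vc.desc_issued) &&
	    CIRC_SPACE(bchan->tail, bchan->head, MAX_DESCRIPTORS + 1))
		bam_start_dma(bchan);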

  Thanks,
  Abhishek

> 
>  	spin_unlock_irqrestore(&bchan->vc.lock, flags);
> @@ -1133,6 +1188,7 @@ static void bam_channel_init(struct bam_device *bdev, struct bam_chan *bchan,
> 
>  	vchan_init(&bchan->vc, &bdev->common);
>  	bchan->vc.desc_free = bam_dma_free_desc;
> +	INIT_LIST_HEAD(&bchan->desc_list);
>  }
> 
>  static const struct of_device_id bam_of_match[] = {
