lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <31b46812-72d5-4f9d-b55d-16a6e10afe7d@acm.org>
Date: Fri, 21 Mar 2025 09:20:05 -0700
From: Bart Van Assche <bvanassche@....org>
To: Neil Armstrong <neil.armstrong@...aro.org>,
 Alim Akhtar <alim.akhtar@...sung.com>, Avri Altman <avri.altman@....com>,
 "James E.J. Bottomley" <James.Bottomley@...senPartnership.com>,
 "Martin K. Petersen" <martin.petersen@...cle.com>
Cc: Manivannan Sadhasivam <manivannan.sadhasivam@...aro.org>,
 linux-arm-msm@...r.kernel.org, linux-scsi@...r.kernel.org,
 linux-kernel@...r.kernel.org
Subject: Re: [PATCH RFC] ufs: delegate the interrupt service routine to a
 threaded irq handler

On 3/21/25 9:08 AM, Neil Armstrong wrote:
> On systems with a large number request slots and unavailable MCQ,
> the current design of the interrupt handler can delay handling of
> other subsystems interrupts causing display artifacts, GPU stalls
> or system firmware requests timeouts.
> 
> Since the interrupt routine can take quite some time, it's
> preferable to move it to a threaded handler and leave the
> hard interrupt handler save the status and disable the irq
> until processing is finished in the thread.
> 
> This fixes all encountered issued when running FIO tests
> on the Qualcomm SM8650 platform.
> 
> Example of errors reported on a loaded system:
>   [drm:dpu_encoder_frame_done_timeout:2706] [dpu error]enc32 frame done timeout
>   msm_dpu ae01000.display-controller: [drm:hangcheck_handler [msm]] *ERROR* 67.5.20.1: hangcheck detected gpu lockup rb 2!
>   msm_dpu ae01000.display-controller: [drm:hangcheck_handler [msm]] *ERROR* 67.5.20.1:     completed fence: 74285
>   msm_dpu ae01000.display-controller: [drm:hangcheck_handler [msm]] *ERROR* 67.5.20.1:     submitted fence: 74286
>   Error sending AMC RPMH requests (-110)
> 
> Reported bandwidth is not affected on various tests.
> 
> Signed-off-by: Neil Armstrong <neil.armstrong@...aro.org>
> ---
>   drivers/ufs/core/ufshcd.c | 43 ++++++++++++++++++++++++++++++++++++-------
>   include/ufs/ufshcd.h      |  2 ++
>   2 files changed, 38 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
> index 0534390c2a35d0671156d79a4b1981a257d2fbfa..0fa3cb48ce0e39439afb0f6d334b835d9e496387 100644
> --- a/drivers/ufs/core/ufshcd.c
> +++ b/drivers/ufs/core/ufshcd.c
> @@ -6974,7 +6974,7 @@ static irqreturn_t ufshcd_sl_intr(struct ufs_hba *hba, u32 intr_status)
>   }
>   
>   /**
> - * ufshcd_intr - Main interrupt service routine
> + * ufshcd_intr - Threaded interrupt service routine
>    * @irq: irq number
>    * @__hba: pointer to adapter instance
>    *
> @@ -6982,7 +6982,7 @@ static irqreturn_t ufshcd_sl_intr(struct ufs_hba *hba, u32 intr_status)
>    *  IRQ_HANDLED - If interrupt is valid
>    *  IRQ_NONE    - If invalid interrupt
>    */
> -static irqreturn_t ufshcd_intr(int irq, void *__hba)
> +static irqreturn_t ufshcd_threaded_intr(int irq, void *__hba)
>   {
>   	u32 intr_status, enabled_intr_status = 0;
>   	irqreturn_t retval = IRQ_NONE;
> @@ -6990,8 +6990,6 @@ static irqreturn_t ufshcd_intr(int irq, void *__hba)
>   	int retries = hba->nutrs;
>   
>   	intr_status = ufshcd_readl(hba, REG_INTERRUPT_STATUS);
> -	hba->ufs_stats.last_intr_status = intr_status;
> -	hba->ufs_stats.last_intr_ts = local_clock();
>   
>   	/*
>   	 * There could be max of hba->nutrs reqs in flight and in worst case
> @@ -7000,8 +6998,7 @@ static irqreturn_t ufshcd_intr(int irq, void *__hba)
>   	 * again in a loop until we process all of the reqs before returning.
>   	 */
>   	while (intr_status && retries--) {
> -		enabled_intr_status =
> -			intr_status & ufshcd_readl(hba, REG_INTERRUPT_ENABLE);
> +		enabled_intr_status = intr_status & hba->intr_en;
>   		ufshcd_writel(hba, intr_status, REG_INTERRUPT_STATUS);
>   		if (enabled_intr_status)
>   			retval |= ufshcd_sl_intr(hba, enabled_intr_status);
> @@ -7020,9 +7017,40 @@ static irqreturn_t ufshcd_intr(int irq, void *__hba)
>   		ufshcd_dump_regs(hba, 0, UFSHCI_REG_SPACE_SIZE, "host_regs: ");
>   	}
>   
> +	ufshcd_writel(hba, hba->intr_en, REG_INTERRUPT_ENABLE);
> +
>   	return retval;
>   }
>   
> +/**
> + * ufshcd_intr - Main interrupt service routine
> + * @irq: irq number
> + * @__hba: pointer to adapter instance
> + *
> + * Return:
> + *  IRQ_WAKE_THREAD - If interrupt is valid
> + *  IRQ_NONE	    - If invalid interrupt
> + */
> +static irqreturn_t ufshcd_intr(int irq, void *__hba)
> +{
> +	u32 intr_status, enabled_intr_status = 0;
> +	irqreturn_t retval = IRQ_NONE;
> +	struct ufs_hba *hba = __hba;
> +	int retries = hba->nutrs;
> +
> +	intr_status = ufshcd_readl(hba, REG_INTERRUPT_STATUS);
> +	hba->ufs_stats.last_intr_status = intr_status;
> +	hba->ufs_stats.last_intr_ts = local_clock();
> +
> +	if (unlikely(!intr_status))
> +		return IRQ_NONE;
> +
> +	hba->intr_en = ufshcd_readl(hba, REG_INTERRUPT_ENABLE);
> +	ufshcd_writel(hba, 0, REG_INTERRUPT_ENABLE);
> +
> +	return IRQ_WAKE_THREAD;
> +}
> +
>   static int ufshcd_clear_tm_cmd(struct ufs_hba *hba, int tag)
>   {
>   	int err = 0;
> @@ -10581,7 +10609,8 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq)
>   	ufshcd_readl(hba, REG_INTERRUPT_ENABLE);
>   
>   	/* IRQ registration */
> -	err = devm_request_irq(dev, irq, ufshcd_intr, IRQF_SHARED, UFSHCD, hba);
> +	err = devm_request_threaded_irq(dev, irq, ufshcd_intr, ufshcd_threaded_intr,
> +					IRQF_SHARED, UFSHCD, hba);
>   	if (err) {
>   		dev_err(hba->dev, "request irq failed\n");
>   		goto out_disable;
> diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
> index e3909cc691b2a854a270279901edacaa5c5120d6..03a7216b89fd63c297479422d1213e497ce85d8e 100644
> --- a/include/ufs/ufshcd.h
> +++ b/include/ufs/ufshcd.h
> @@ -893,6 +893,7 @@ enum ufshcd_mcq_opr {
>    * @ufshcd_state: UFSHCD state
>    * @eh_flags: Error handling flags
>    * @intr_mask: Interrupt Mask Bits
> + * @intr_en: Saved Interrupt Enable Bits
>    * @ee_ctrl_mask: Exception event control mask
>    * @ee_drv_mask: Exception event mask for driver
>    * @ee_usr_mask: Exception event mask for user (set via debugfs)
> @@ -1040,6 +1041,7 @@ struct ufs_hba {
>   	enum ufshcd_state ufshcd_state;
>   	u32 eh_flags;
>   	u32 intr_mask;
> +	u32 intr_en;
>   	u16 ee_ctrl_mask;
>   	u16 ee_drv_mask;
>   	u16 ee_usr_mask;

I don't like this patch because:
- It reduces performance (IOPS) for systems on which MCQ is supported
   and enabled. Please only use threaded interrupts if MCQ is not used.
- It introduces race conditions on the REG_INTERRUPT_ENABLE register.
   There are plenty of ufshcd_(enable|disable)_intr() calls in the UFS
   driver. Please remove all code that modifies REG_INTERRUPT_ENABLE
   from this patch.
- Instead of retaining hba->ufs_stats.last_intr_status and
   hba->ufs_stats.last_intr_ts, please remove both members and also
   the debug code that reports the values of these member variables.
   Please also remove hba->intr_en.

Thanks,

Bart.

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ