[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <62bfa26a-822b-462d-ba8d-e0d85610e278@redhat.com>
Date: Wed, 18 Jun 2025 13:17:31 -0400
From: John Meneghini <jmeneghi@...hat.com>
To: martin.petersen@...cle.com
Cc: arulponn@...co.com, djhawar@...co.com, gcboffa@...co.com,
mkai2@...co.com, satishkh@...co.com, aeasi@...co.com, jejb@...ux.ibm.com,
linux-scsi@...r.kernel.org, linux-kernel@...r.kernel.org, revers@...hat.com,
dan.carpenter@...aro.org, stable@...r.kernel.org, sebaddel@...co.com,
Karan Tilak Kumar <kartilak@...co.com>
Subject: Re: [PATCH v6 1/4] scsi: fnic: Fix crash in fnic_wq_cmpl_handler when
FDMI times out
Great Job. Thanks Karan.
Reviewed-by: John Meneghini <jmeneghi@...hat.com>
Martin, if possible, please include these in 6.16/scsi-fixes. These are critical bug fixes which are holding up the release of RHEL-9.7.
/John
On 6/17/25 8:34 PM, Karan Tilak Kumar wrote:
> When both the RHBA and RPA FDMI requests time out, fnic reuses a frame
> to send ABTS for each of them. On send completion, this causes an
> attempt to free the same frame twice that leads to a crash.
>
> Fix crash by allocating separate frames for RHBA and RPA,
> and modify ABTS logic accordingly.
>
> Tested by checking MDS for FDMI information.
> Tested by using instrumented driver to:
> Drop PLOGI response
> Drop RHBA response
> Drop RPA response
> Drop RHBA and RPA response
> Drop PLOGI response + ABTS response
> Drop RHBA response + ABTS response
> Drop RPA response + ABTS response
> Drop RHBA and RPA response + ABTS response for both of them
>
> Fixes: 09c1e6ab4ab2 ("scsi: fnic: Add and integrate support for FDMI")
> Reviewed-by: Sesidhar Baddela <sebaddel@...co.com>
> Reviewed-by: Arulprabhu Ponnusamy <arulponn@...co.com>
> Reviewed-by: Gian Carlo Boffa <gcboffa@...co.com>
> Tested-by: Arun Easi <aeasi@...co.com>
> Co-developed-by: Arun Easi <aeasi@...co.com>
> Signed-off-by: Arun Easi <aeasi@...co.com>
> Tested-by: Karan Tilak Kumar <kartilak@...co.com>
> Cc: <stable@...r.kernel.org>
> Signed-off-by: Karan Tilak Kumar <kartilak@...co.com>
> ---
> Changes between v5 and v6:
> - Incorporate review comments from John:
> - Rebase patches on 6.17/scsi-queue
>
> Changes between v4 and v5:
> - Incorporate review comments from John:
> - Refactor patches
>
> Changes between v3 and v4:
> - Incorporate review comments from Dan:
> - Remove comments from Cc tag
>
> Changes between v2 and v3:
> - Incorporate review comments from Dan:
> - Add Cc to stable
>
> Changes between v1 and v2:
> - Incorporate review comments from Dan:
> - Add Fixes tag
> ---
> drivers/scsi/fnic/fdls_disc.c | 113 +++++++++++++++++++++++++---------
> drivers/scsi/fnic/fnic.h | 2 +-
> drivers/scsi/fnic/fnic_fdls.h | 1 +
> 3 files changed, 87 insertions(+), 29 deletions(-)
>
> diff --git a/drivers/scsi/fnic/fdls_disc.c b/drivers/scsi/fnic/fdls_disc.c
> index f8ab69c51dab..36b498ad55b4 100644
> --- a/drivers/scsi/fnic/fdls_disc.c
> +++ b/drivers/scsi/fnic/fdls_disc.c
> @@ -763,47 +763,69 @@ static void fdls_send_fabric_abts(struct fnic_iport_s *iport)
> iport->fabric.timer_pending = 1;
> }
>
> -static void fdls_send_fdmi_abts(struct fnic_iport_s *iport)
> +static uint8_t *fdls_alloc_init_fdmi_abts_frame(struct fnic_iport_s *iport,
> + uint16_t oxid)
> {
> - uint8_t *frame;
> + struct fc_frame_header *pfdmi_abts;
> uint8_t d_id[3];
> + uint8_t *frame;
> struct fnic *fnic = iport->fnic;
> - struct fc_frame_header *pfabric_abts;
> - unsigned long fdmi_tov;
> - uint16_t oxid;
> - uint16_t frame_size = FNIC_ETH_FCOE_HDRS_OFFSET +
> - sizeof(struct fc_frame_header);
>
> frame = fdls_alloc_frame(iport);
> if (frame == NULL) {
> FNIC_FCS_DBG(KERN_ERR, fnic->host, fnic->fnic_num,
> "Failed to allocate frame to send FDMI ABTS");
> - return;
> + return NULL;
> }
>
> - pfabric_abts = (struct fc_frame_header *) (frame + FNIC_ETH_FCOE_HDRS_OFFSET);
> + pfdmi_abts = (struct fc_frame_header *) (frame + FNIC_ETH_FCOE_HDRS_OFFSET);
> fdls_init_fabric_abts_frame(frame, iport);
>
> hton24(d_id, FC_FID_MGMT_SERV);
> - FNIC_STD_SET_D_ID(*pfabric_abts, d_id);
> + FNIC_STD_SET_D_ID(*pfdmi_abts, d_id);
> + FNIC_STD_SET_OX_ID(*pfdmi_abts, oxid);
> +
> + return frame;
> +}
> +
> +static void fdls_send_fdmi_abts(struct fnic_iport_s *iport)
> +{
> + uint8_t *frame;
> + unsigned long fdmi_tov;
> + uint16_t frame_size = FNIC_ETH_FCOE_HDRS_OFFSET +
> + sizeof(struct fc_frame_header);
>
> if (iport->fabric.fdmi_pending & FDLS_FDMI_PLOGI_PENDING) {
> - oxid = iport->active_oxid_fdmi_plogi;
> - FNIC_STD_SET_OX_ID(*pfabric_abts, oxid);
> + frame = fdls_alloc_init_fdmi_abts_frame(iport,
> + iport->active_oxid_fdmi_plogi);
> + if (frame == NULL)
> + return;
> +
> fnic_send_fcoe_frame(iport, frame, frame_size);
> } else {
> if (iport->fabric.fdmi_pending & FDLS_FDMI_REG_HBA_PENDING) {
> - oxid = iport->active_oxid_fdmi_rhba;
> - FNIC_STD_SET_OX_ID(*pfabric_abts, oxid);
> + frame = fdls_alloc_init_fdmi_abts_frame(iport,
> + iport->active_oxid_fdmi_rhba);
> + if (frame == NULL)
> + return;
> +
> fnic_send_fcoe_frame(iport, frame, frame_size);
> }
> if (iport->fabric.fdmi_pending & FDLS_FDMI_RPA_PENDING) {
> - oxid = iport->active_oxid_fdmi_rpa;
> - FNIC_STD_SET_OX_ID(*pfabric_abts, oxid);
> + frame = fdls_alloc_init_fdmi_abts_frame(iport,
> + iport->active_oxid_fdmi_rpa);
> + if (frame == NULL) {
> + if (iport->fabric.fdmi_pending & FDLS_FDMI_REG_HBA_PENDING)
> + goto arm_timer;
> + else
> + return;
> + }
> +
> fnic_send_fcoe_frame(iport, frame, frame_size);
> }
> }
>
> +arm_timer:
> fdmi_tov = jiffies + msecs_to_jiffies(2 * iport->e_d_tov);
> mod_timer(&iport->fabric.fdmi_timer, round_jiffies(fdmi_tov));
> iport->fabric.fdmi_pending |= FDLS_FDMI_ABORT_PENDING;
> @@ -2245,6 +2267,21 @@ void fdls_fabric_timer_callback(struct timer_list *t)
> spin_unlock_irqrestore(&fnic->fnic_lock, flags);
> }
>
> +void fdls_fdmi_retry_plogi(struct fnic_iport_s *iport)
> +{
> + struct fnic *fnic = iport->fnic;
> +
> + iport->fabric.fdmi_pending = 0;
> + /* If max retries not exhausted, start over from fdmi plogi */
> + if (iport->fabric.fdmi_retry < FDLS_FDMI_MAX_RETRY) {
> + iport->fabric.fdmi_retry++;
> + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num,
> + "Retry FDMI PLOGI. FDMI retry: %d",
> + iport->fabric.fdmi_retry);
> + fdls_send_fdmi_plogi(iport);
> + }
> +}
> +
> void fdls_fdmi_timer_callback(struct timer_list *t)
> {
> struct fnic_fdls_fabric_s *fabric = timer_container_of(fabric, t,
> @@ -2291,14 +2328,7 @@ void fdls_fdmi_timer_callback(struct timer_list *t)
> FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num,
> "fdmi timer callback : 0x%x\n", iport->fabric.fdmi_pending);
>
> - iport->fabric.fdmi_pending = 0;
> - /* If max retries not exhaused, start over from fdmi plogi */
> - if (iport->fabric.fdmi_retry < FDLS_FDMI_MAX_RETRY) {
> - iport->fabric.fdmi_retry++;
> - FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num,
> - "retry fdmi timer %d", iport->fabric.fdmi_retry);
> - fdls_send_fdmi_plogi(iport);
> - }
> + fdls_fdmi_retry_plogi(iport);
> FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num,
> "fdmi timer callback : 0x%x\n", iport->fabric.fdmi_pending);
> spin_unlock_irqrestore(&fnic->fnic_lock, flags);
> @@ -3716,11 +3746,32 @@ static void fdls_process_fdmi_abts_rsp(struct fnic_iport_s *iport,
> switch (FNIC_FRAME_TYPE(oxid)) {
> case FNIC_FRAME_TYPE_FDMI_PLOGI:
> fdls_free_oxid(iport, oxid, &iport->active_oxid_fdmi_plogi);
> +
> + iport->fabric.fdmi_pending &= ~FDLS_FDMI_PLOGI_PENDING;
> + iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING;
> break;
> case FNIC_FRAME_TYPE_FDMI_RHBA:
> + iport->fabric.fdmi_pending &= ~FDLS_FDMI_REG_HBA_PENDING;
> +
> + /* If RPA is still pending, don't turn off ABORT PENDING.
> + * We count on the timer to detect the ABTS timeout and take
> + * corrective action.
> + */
> + if (!(iport->fabric.fdmi_pending & FDLS_FDMI_RPA_PENDING))
> + iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING;
> +
> fdls_free_oxid(iport, oxid, &iport->active_oxid_fdmi_rhba);
> break;
> case FNIC_FRAME_TYPE_FDMI_RPA:
> + iport->fabric.fdmi_pending &= ~FDLS_FDMI_RPA_PENDING;
> +
> + /* If RHBA is still pending, don't turn off ABORT PENDING.
> + * We count on the timer to detect the ABTS timeout and take
> + * corrective action.
> + */
> + if (!(iport->fabric.fdmi_pending & FDLS_FDMI_REG_HBA_PENDING))
> + iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING;
> +
> fdls_free_oxid(iport, oxid, &iport->active_oxid_fdmi_rpa);
> break;
> default:
> @@ -3730,10 +3781,16 @@ static void fdls_process_fdmi_abts_rsp(struct fnic_iport_s *iport,
> break;
> }
>
> - timer_delete_sync(&iport->fabric.fdmi_timer);
> - iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING;
> -
> - fdls_send_fdmi_plogi(iport);
> + /*
> + * Only if ABORT PENDING is off, delete the timer, and if no other
> + * operations are pending, retry FDMI.
> + * Otherwise, let the timer pop and take the appropriate action.
> + */
> + if (!(iport->fabric.fdmi_pending & FDLS_FDMI_ABORT_PENDING)) {
> + timer_delete_sync(&iport->fabric.fdmi_timer);
> + if (!iport->fabric.fdmi_pending)
> + fdls_fdmi_retry_plogi(iport);
> + }
> }
>
> static void
> diff --git a/drivers/scsi/fnic/fnic.h b/drivers/scsi/fnic/fnic.h
> index 6c5f6046b1f5..86e293ce530d 100644
> --- a/drivers/scsi/fnic/fnic.h
> +++ b/drivers/scsi/fnic/fnic.h
> @@ -30,7 +30,7 @@
>
> #define DRV_NAME "fnic"
> #define DRV_DESCRIPTION "Cisco FCoE HBA Driver"
> -#define DRV_VERSION "1.8.0.0"
> +#define DRV_VERSION "1.8.0.1"
> #define PFX DRV_NAME ": "
> #define DFX DRV_NAME "%d: "
>
> diff --git a/drivers/scsi/fnic/fnic_fdls.h b/drivers/scsi/fnic/fnic_fdls.h
> index 8e610b65ad57..531d0b37e450 100644
> --- a/drivers/scsi/fnic/fnic_fdls.h
> +++ b/drivers/scsi/fnic/fnic_fdls.h
> @@ -394,6 +394,7 @@ void fdls_send_tport_abts(struct fnic_iport_s *iport,
> bool fdls_delete_tport(struct fnic_iport_s *iport,
> struct fnic_tport_s *tport);
> void fdls_fdmi_timer_callback(struct timer_list *t);
> +void fdls_fdmi_retry_plogi(struct fnic_iport_s *iport);
>
> /* fnic_fcs.c */
> void fnic_fdls_init(struct fnic *fnic, int usefip);
Powered by blists - more mailing lists