lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <0d722665-140d-8391-2cab-9a3ef0d5d0e7@gmail.com>
Date:   Thu, 26 Jan 2023 08:53:28 +0200
From:   Tariq Toukan <ttoukan.linux@...il.com>
To:     Vadim Fedorenko <vfedorenko@...ek.ru>,
        Vadim Fedorenko <vadfed@...com>, Aya Levin <ayal@...dia.com>,
        Saeed Mahameed <saeedm@...dia.com>,
        Jakub Kicinski <kuba@...nel.org>, Gal Pressman <gal@...dia.com>
Cc:     Vadim Fedorenko <vadfed@...a.com>, netdev@...r.kernel.org,
        Tariq Toukan <tariqt@...dia.com>
Subject: Re: [PATCH net v3 2/2] mlx5: fix possible ptp queue fifo
 use-after-free



On 26/01/2023 3:02, Vadim Fedorenko wrote:
> From: Vadim Fedorenko <vadfed@...a.com>
> 
> Fifo pointers were not checked during push and pop operations and this
> could potentially lead to use-after-free or skb leak under heavy PTP
> traffic.
> 
> Also there were OOO cqe spotted which lead to drain of the queue and
> use-after-free because of lack of fifo pointers check. Special check
> is added to avoid resync operation if SKB could not exist in the fifo
> because of OOO cqe (skb_id must be between consumer and producer index).
> 

Hi,

Let's hold on with this patch.
I don't think we understand the root cause. I'm also not sure this patch 
doesn't degrade the successful flow. See comment below.

We don't expect an xmit operation coming from the kernel while the TXQ 
is stopped. This might be the reason for the fifo overflow. Does it 
happen? If so, let's understand why and fix it.

Your fix to mlx5e_skb_fifo_has_room() should help with preventing the 
fifo overflow. Does the issue still occur even after your patch [1]?

Also, it's not easy to decisively determine that a CQE arrived OOO. I 
doubt this can happen. The SQ is cyclic and works in-order. It's more 
likely a full cycle of lost CQEs.

BTW, what value do you see in your environment for
MLX5_CAP_GEN_2(mdev, ts_cqe_metadata_size2wqe_counter) ?

Thanks,
Tariq

[1] [PATCH net v3 1/2] mlx5: fix skb leak while fifo resync and push

> Fixes: 58a518948f60 ("net/mlx5e: Add resiliency for PTP TX port timestamp")
> Signed-off-by: Vadim Fedorenko <vadfed@...a.com>
> ---
>   .../net/ethernet/mellanox/mlx5/core/en/ptp.c  | 23 ++++++++++++++-----
>   .../net/ethernet/mellanox/mlx5/core/en/txrx.h |  7 +++++-
>   2 files changed, 23 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
> index b72de2b520ec..4ac7483dcbcc 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
> @@ -86,7 +86,7 @@ static bool mlx5e_ptp_ts_cqe_drop(struct mlx5e_ptpsq *ptpsq, u16 skb_cc, u16 skb
>   	return (ptpsq->ts_cqe_ctr_mask && (skb_cc != skb_id));
>   }
>   
> -static void mlx5e_ptp_skb_fifo_ts_cqe_resync(struct mlx5e_ptpsq *ptpsq, u16 skb_cc,
> +static bool mlx5e_ptp_skb_fifo_ts_cqe_resync(struct mlx5e_ptpsq *ptpsq, u16 skb_cc,
>   					     u16 skb_id, int budget)
>   {
>   	struct skb_shared_hwtstamps hwts = {};
> @@ -94,14 +94,23 @@ static void mlx5e_ptp_skb_fifo_ts_cqe_resync(struct mlx5e_ptpsq *ptpsq, u16 skb_
>   
>   	ptpsq->cq_stats->resync_event++;
>   
> -	while (skb_cc != skb_id) {
> -		skb = mlx5e_skb_fifo_pop(&ptpsq->skb_fifo);
> +	if (skb_cc > skb_id || PTP_WQE_CTR2IDX(ptpsq->skb_fifo_pc) < skb_id)

This can give false positives near the edge of the fifo (wraparound).

> +		pr_err_ratelimited("mlx5e: out-of-order ptp cqe\n");
> +		return false;
> +	}
> +
> +	while (skb_cc != skb_id && (skb = mlx5e_skb_fifo_pop(&ptpsq->skb_fifo))) {
>   		hwts.hwtstamp = mlx5e_skb_cb_get_hwts(skb)->cqe_hwtstamp;
>   		skb_tstamp_tx(skb, &hwts);
>   		ptpsq->cq_stats->resync_cqe++;
>   		napi_consume_skb(skb, budget);
>   		skb_cc = PTP_WQE_CTR2IDX(ptpsq->skb_fifo_cc);
>   	}
> +
> +	if (!skb)
> +		return false;
> +
> +	return true;
>   }
>   
>   static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq,
> @@ -111,7 +120,7 @@ static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq,
>   	u16 skb_id = PTP_WQE_CTR2IDX(be16_to_cpu(cqe->wqe_counter));
>   	u16 skb_cc = PTP_WQE_CTR2IDX(ptpsq->skb_fifo_cc);
>   	struct mlx5e_txqsq *sq = &ptpsq->txqsq;
> -	struct sk_buff *skb;
> +	struct sk_buff *skb = NULL;
>   	ktime_t hwtstamp;
>   
>   	if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
> @@ -120,8 +129,10 @@ static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq,
>   		goto out;
>   	}
>   
> -	if (mlx5e_ptp_ts_cqe_drop(ptpsq, skb_cc, skb_id))
> -		mlx5e_ptp_skb_fifo_ts_cqe_resync(ptpsq, skb_cc, skb_id, budget);
> +	if (mlx5e_ptp_ts_cqe_drop(ptpsq, skb_cc, skb_id) &&
> +	    !mlx5e_ptp_skb_fifo_ts_cqe_resync(ptpsq, skb_cc, skb_id, budget)) {
> +		goto out;
> +	}
>   
>   	skb = mlx5e_skb_fifo_pop(&ptpsq->skb_fifo);
>   	hwtstamp = mlx5e_cqe_ts_to_ns(sq->ptp_cyc2time, sq->clock, get_cqe_ts(cqe));
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
> index 15a5a57b47b8..6e559b856afb 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
> @@ -289,14 +289,19 @@ struct sk_buff **mlx5e_skb_fifo_get(struct mlx5e_skb_fifo *fifo, u16 i)
>   static inline
>   void mlx5e_skb_fifo_push(struct mlx5e_skb_fifo *fifo, struct sk_buff *skb)
>   {
> -	struct sk_buff **skb_item = mlx5e_skb_fifo_get(fifo, (*fifo->pc)++);
> +	struct sk_buff **skb_item;
>   
> +	WARN_ONCE(mlx5e_skb_fifo_has_room(fifo), "ptp fifo overflow");
> +	skb_item = mlx5e_skb_fifo_get(fifo, (*fifo->pc)++);
>   	*skb_item = skb;
>   }
>   
>   static inline
>   struct sk_buff *mlx5e_skb_fifo_pop(struct mlx5e_skb_fifo *fifo)
>   {
> +	if (*fifo->pc == *fifo->cc)
> +		return NULL;
> +
>   	return *mlx5e_skb_fifo_get(fifo, (*fifo->cc)++);
>   }
>   

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ