netdev - Re: [Intel-wired-lan] [PATCH iwl-next v7 09/12] iavf: refactor iavf_clean_rx_irq to support legacy and flex descriptors

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <94cf16ed-709b-4cab-9325-52670db25902@intel.com>
Date: Tue, 11 Jun 2024 13:47:46 +0200
From: Alexander Lobakin <aleksander.lobakin@...el.com>
To: Mateusz Polchlopek <mateusz.polchlopek@...el.com>, Jacob Keller
	<jacob.e.keller@...el.com>
CC: <intel-wired-lan@...ts.osuosl.org>, <netdev@...r.kernel.org>, "Wojciech
 Drewek" <wojciech.drewek@...el.com>
Subject: Re: [Intel-wired-lan] [PATCH iwl-next v7 09/12] iavf: refactor
 iavf_clean_rx_irq to support legacy and flex descriptors

From: Mateusz Polchlopek <mateusz.polchlopek@...el.com>
Date: Tue,  4 Jun 2024 09:13:57 -0400

> From: Jacob Keller <jacob.e.keller@...el.com>
> 
> Using VIRTCHNL_VF_OFFLOAD_FLEX_DESC, the iAVF driver is capable of
> negotiating to enable the advanced flexible descriptor layout. Add the
> flexible NIC layout (RXDID=2) as a member of the Rx descriptor union.
> 
> Also add bit position definitions for the status and error indications
> that are needed.
> 
> The iavf_clean_rx_irq function needs to extract a few fields from the Rx
> descriptor, including the size, rx_ptype, and vlan_tag.
> Move the extraction to a separate function that decodes the fields into
> a structure. This will reduce the burden for handling multiple
> descriptor types by keeping the relevant extraction logic in one place.
> 
> To support handling an additional descriptor format with minimal code
> duplication, refactor Rx checksum handling so that the general logic
> is separated from the bit calculations. Introduce an iavf_rx_desc_decoded
> structure which holds the relevant bits decoded from the Rx descriptor.
> This will enable implementing flexible descriptor handling without
> duplicating the general logic twice.
> 
> Introduce an iavf_extract_flex_rx_fields, iavf_flex_rx_hash, and
> iavf_flex_rx_csum functions which operate on the flexible NIC descriptor
> format instead of the legacy 32 byte format. Based on the negotiated
> RXDID, select the correct function for processing the Rx descriptors.
> 
> With this change, the Rx hot path should be functional when using either
> the default legacy 32byte format or when we switch to the flexible NIC
> layout.
> 
> Modify the Rx hot path to add support for the flexible descriptor
> format and add request enabling Rx timestamps for all queues.
> 
> As in ice, make sure we bump the checksum level if the hardware detected
> a packet type which could have an outer checksum. This is important
> because hardware only verifies the inner checksum.
> 
> Signed-off-by: Jacob Keller <jacob.e.keller@...el.com>
> Reviewed-by: Wojciech Drewek <wojciech.drewek@...el.com>
> Co-developed-by: Mateusz Polchlopek <mateusz.polchlopek@...el.com>
> Signed-off-by: Mateusz Polchlopek <mateusz.polchlopek@...el.com>
> ---
>  drivers/net/ethernet/intel/iavf/iavf_txrx.c   | 354 +++++++++++++-----
>  drivers/net/ethernet/intel/iavf/iavf_txrx.h   |   8 +
>  drivers/net/ethernet/intel/iavf/iavf_type.h   | 147 ++++++--
>  .../net/ethernet/intel/iavf/iavf_virtchnl.c   |   5 +
>  4 files changed, 391 insertions(+), 123 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
> index 26b424fd6718..97da5af52ad7 100644
> --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c
> +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
> @@ -895,63 +895,68 @@ bool iavf_alloc_rx_buffers(struct iavf_ring *rx_ring, u16 cleaned_count)
>  	return true;
>  }
>  
> +/* iavf_rx_csum_decoded
> + *
> + * Checksum offload bits decoded from the receive descriptor.
> + */
> +struct iavf_rx_csum_decoded {
> +	u8 l3l4p : 1;
> +	u8 ipe : 1;
> +	u8 eipe : 1;
> +	u8 eudpe : 1;
> +	u8 ipv6exadd : 1;
> +	u8 l4e : 1;
> +	u8 pprs : 1;
> +	u8 nat : 1;
> +};

I see the same struct in idpf, probably a candidate for libeth.

> +
>  /**
> - * iavf_rx_checksum - Indicate in skb if hw indicated a good cksum
> + * iavf_rx_csum - Indicate in skb if hw indicated a good checksum
>   * @vsi: the VSI we care about
>   * @skb: skb currently being received and modified
> - * @rx_desc: the receive descriptor
> + * @ptype: decoded ptype information
> + * @csum_bits: decoded Rx descriptor information
>   **/
> -static void iavf_rx_checksum(struct iavf_vsi *vsi,
> -			     struct sk_buff *skb,
> -			     union iavf_rx_desc *rx_desc)
> +static void iavf_rx_csum(struct iavf_vsi *vsi, struct sk_buff *skb,

Can't @vsi be const?

> +			 struct libeth_rx_pt *ptype,

struct libeth_rx_pt is smaller than a pointer to it. Pass it directly

> +			 struct iavf_rx_csum_decoded *csum_bits)

Same for this struct.

>  {
> -	struct libeth_rx_pt decoded;
> -	u32 rx_error, rx_status;
>  	bool ipv4, ipv6;
> -	u8 ptype;
> -	u64 qword;
>  
>  	skb->ip_summed = CHECKSUM_NONE;
>  
> -	qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
> -	ptype = FIELD_GET(IAVF_RXD_QW1_PTYPE_MASK, qword);
> -
> -	decoded = libie_rx_pt_parse(ptype);
> -	if (!libeth_rx_pt_has_checksum(vsi->netdev, decoded))
> -		return;
> -
> -	rx_error = FIELD_GET(IAVF_RXD_QW1_ERROR_MASK, qword);
> -	rx_status = FIELD_GET(IAVF_RXD_QW1_STATUS_MASK, qword);
> -
>  	/* did the hardware decode the packet and checksum? */
> -	if (!(rx_status & BIT(IAVF_RX_DESC_STATUS_L3L4P_SHIFT)))
> +	if (!csum_bits->l3l4p)
>  		return;
>  
> -	ipv4 = libeth_rx_pt_get_ip_ver(decoded) == LIBETH_RX_PT_OUTER_IPV4;
> -	ipv6 = libeth_rx_pt_get_ip_ver(decoded) == LIBETH_RX_PT_OUTER_IPV6;
> +	ipv4 = libeth_rx_pt_get_ip_ver(*ptype) == LIBETH_RX_PT_OUTER_IPV4;
> +	ipv6 = libeth_rx_pt_get_ip_ver(*ptype) == LIBETH_RX_PT_OUTER_IPV6;
>  
> -	if (ipv4 &&
> -	    (rx_error & (BIT(IAVF_RX_DESC_ERROR_IPE_SHIFT) |
> -			 BIT(IAVF_RX_DESC_ERROR_EIPE_SHIFT))))
> +	if (ipv4 && (csum_bits->ipe || csum_bits->eipe))
>  		goto checksum_fail;
>  
>  	/* likely incorrect csum if alternate IP extension headers found */
> -	if (ipv6 &&
> -	    rx_status & BIT(IAVF_RX_DESC_STATUS_IPV6EXADD_SHIFT))
> -		/* don't increment checksum err here, non-fatal err */
> +	if (ipv6 && csum_bits->ipv6exadd)
>  		return;
>  
>  	/* there was some L4 error, count error and punt packet to the stack */
> -	if (rx_error & BIT(IAVF_RX_DESC_ERROR_L4E_SHIFT))
> +	if (csum_bits->l4e)
>  		goto checksum_fail;
>  
>  	/* handle packets that were not able to be checksummed due
>  	 * to arrival speed, in this case the stack can compute
>  	 * the csum.
>  	 */
> -	if (rx_error & BIT(IAVF_RX_DESC_ERROR_PPRS_SHIFT))
> +	if (csum_bits->pprs)
>  		return;
>  
> +	/* If there is an outer header present that might contain a checksum
> +	 * we need to bump the checksum level by 1 to reflect the fact that
> +	 * we are indicating we validated the inner checksum.
> +	 */
> +	if (ptype->tunnel_type >= LIBETH_RX_PT_TUNNEL_IP_GRENAT)
> +		skb->csum_level = 1;
> +
>  	skb->ip_summed = CHECKSUM_UNNECESSARY;
>  	return;

For the whole function: you need to use unlikely() for checksum errors
to not slow down regular frames.
Also, I would even unlikely() packets with not verified checksum as it's
really rare case.
Optimize hotpath for most common workloads.

>  
> @@ -960,22 +965,105 @@ static void iavf_rx_checksum(struct iavf_vsi *vsi,
>  }
>  
>  /**
> - * iavf_rx_hash - set the hash value in the skb
> + * iavf_legacy_rx_csum - Indicate in skb if hw indicated a good cksum
> + * @vsi: the VSI we care about
> + * @skb: skb currently being received and modified
> + * @rx_desc: the receive descriptor
> + *
> + * This function only operates on the VIRTCHNL_RXDID_1_32B_BASE legacy 32byte
> + * descriptor writeback format.
> + **/
> +static void iavf_legacy_rx_csum(struct iavf_vsi *vsi,
> +				struct sk_buff *skb,
> +				union iavf_rx_desc *rx_desc)

@vsi and @rx_desc can be const.

> +{
> +	struct iavf_rx_csum_decoded csum_bits;
> +	struct libeth_rx_pt decoded;
> +
> +	u32 rx_error;
> +	u64 qword;
> +	u16 ptype;
> +
> +	qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
> +	ptype = FIELD_GET(IAVF_RXD_QW1_PTYPE_MASK, qword);
> +	rx_error = FIELD_GET(IAVF_RXD_QW1_ERROR_MASK, qword);

You don't need @rx_error before libeth_rx_pt_has_checksum().

> +	decoded = libie_rx_pt_parse(ptype);
> +
> +	if (!libeth_rx_pt_has_checksum(vsi->netdev, decoded))
> +		return;
> +
> +	csum_bits.ipe = FIELD_GET(IAVF_RX_DESC_ERROR_IPE_MASK, rx_error);

So, @rx_error is a field of @qword and then there are more subfields?
Why not extract those fields directly from @qword?

> +	csum_bits.eipe = FIELD_GET(IAVF_RX_DESC_ERROR_EIPE_MASK, rx_error);
> +	csum_bits.l4e = FIELD_GET(IAVF_RX_DESC_ERROR_L4E_MASK, rx_error);
> +	csum_bits.pprs = FIELD_GET(IAVF_RX_DESC_ERROR_PPRS_MASK, rx_error);
> +	csum_bits.l3l4p = FIELD_GET(IAVF_RX_DESC_STATUS_L3L4P_MASK, rx_error);
> +	csum_bits.ipv6exadd = FIELD_GET(IAVF_RX_DESC_STATUS_IPV6EXADD_MASK,
> +					rx_error);
> +	csum_bits.nat = 0;
> +	csum_bits.eudpe = 0;

Initialize the whole struct with = { } at the declaration site and
remove this.

> +
> +	iavf_rx_csum(vsi, skb, &decoded, &csum_bits);

In order to avoid having 2 call sites for this, make
iavf_{flex,legacy}_rx_csum() return @csum_bits and call iavf_rx_csum()
outside of them once.

> +}
> +
> +/**
> + * iavf_flex_rx_csum - Indicate in skb if hw indicated a good cksum
> + * @vsi: the VSI we care about
> + * @skb: skb currently being received and modified
> + * @rx_desc: the receive descriptor
> + *
> + * This function only operates on the VIRTCHNL_RXDID_2_FLEX_SQ_NIC flexible
> + * descriptor writeback format.
> + **/
> +static void iavf_flex_rx_csum(struct iavf_vsi *vsi, struct sk_buff *skb,
> +			      union iavf_rx_desc *rx_desc)

Same for const.

> +{
> +	struct iavf_rx_csum_decoded csum_bits;
> +	struct libeth_rx_pt decoded;
> +	u16 rx_status0, ptype;
> +
> +	rx_status0 = le16_to_cpu(rx_desc->flex_wb.status_error0);

This is not needed before libeth_rx_pt_has_checksum().

> +	ptype = FIELD_GET(IAVF_RX_FLEX_DESC_PTYPE_M,
> +			  le16_to_cpu(rx_desc->flex_wb.ptype_flexi_flags0));

You also access this field later when extracting base fields. Shouldn't
this be combined somehow?

> +	decoded = libie_rx_pt_parse(ptype);
> +
> +	if (!libeth_rx_pt_has_checksum(vsi->netdev, decoded))
> +		return;
> +
> +	csum_bits.ipe = FIELD_GET(IAVF_RX_FLEX_DESC_STATUS0_XSUM_IPE_M,
> +				  rx_status0);
> +	csum_bits.eipe = FIELD_GET(IAVF_RX_FLEX_DESC_STATUS0_XSUM_EIPE_M,
> +				   rx_status0);
> +	csum_bits.l4e = FIELD_GET(IAVF_RX_FLEX_DESC_STATUS0_XSUM_L4E_M,
> +				  rx_status0);
> +	csum_bits.eudpe = FIELD_GET(IAVF_RX_FLEX_DESC_STATUS0_XSUM_EUDPE_M,
> +				    rx_status0);
> +	csum_bits.l3l4p = FIELD_GET(IAVF_RX_FLEX_DESC_STATUS0_L3L4P_M,
> +				    rx_status0);
> +	csum_bits.ipv6exadd = FIELD_GET(IAVF_RX_FLEX_DESC_STATUS0_IPV6EXADD_M,
> +					rx_status0);
> +	csum_bits.nat = FIELD_GET(IAVF_RX_FLEX_DESC_STATUS1_NAT_M, rx_status0);
> +	csum_bits.pprs = 0;

See above for struct initialization.

> +
> +	iavf_rx_csum(vsi, skb, &decoded, &csum_bits);

See above.

> +}
> +
> +/**
> + * iavf_legacy_rx_hash - set the hash value in the skb
>   * @ring: descriptor ring
>   * @rx_desc: specific descriptor
>   * @skb: skb currently being received and modified
>   * @rx_ptype: Rx packet type
> + *
> + * This function only operates on the VIRTCHNL_RXDID_1_32B_BASE legacy 32byte
> + * descriptor writeback format.
>   **/
> -static void iavf_rx_hash(struct iavf_ring *ring,
> -			 union iavf_rx_desc *rx_desc,
> -			 struct sk_buff *skb,
> -			 u8 rx_ptype)
> +static void iavf_legacy_rx_hash(struct iavf_ring *ring,
> +				union iavf_rx_desc *rx_desc,

Const for both.

> +				struct sk_buff *skb, u8 rx_ptype)
>  {
> +	const __le64 rss_mask = cpu_to_le64(IAVF_RX_DESC_STATUS_FLTSTAT_MASK);
>  	struct libeth_rx_pt decoded;
>  	u32 hash;
> -	const __le64 rss_mask =
> -		cpu_to_le64((u64)IAVF_RX_DESC_FLTSTAT_RSS_HASH <<
> -			    IAVF_RX_DESC_STATUS_FLTSTAT_SHIFT);

Looks like unrelated, but nice change anyway.

>  
>  	decoded = libie_rx_pt_parse(rx_ptype);
>  	if (!libeth_rx_pt_has_hash(ring->netdev, decoded))
> @@ -987,6 +1075,38 @@ static void iavf_rx_hash(struct iavf_ring *ring,
>  	}
>  }
>  
> +/**
> + * iavf_flex_rx_hash - set the hash value in the skb
> + * @ring: descriptor ring
> + * @rx_desc: specific descriptor
> + * @skb: skb currently being received and modified
> + * @rx_ptype: Rx packet type
> + *
> + * This function only operates on the VIRTCHNL_RXDID_2_FLEX_SQ_NIC flexible
> + * descriptor writeback format.
> + **/
> +static void iavf_flex_rx_hash(struct iavf_ring *ring,
> +			      union iavf_rx_desc *rx_desc,

Const.

> +			      struct sk_buff *skb, u16 rx_ptype)

Why is @rx_ptype u16 here, but u8 above? Just use u32 for both.

> +{
> +	struct libeth_rx_pt decoded;
> +	u16 status0;
> +	u32 hash;
> +
> +	if (!(ring->netdev->features & NETIF_F_RXHASH))

This is checked in libeth_rx_pt_has_hash(), why check 2 times?

> +		return;
> +
> +	decoded = libie_rx_pt_parse(rx_ptype);
> +	if (!libeth_rx_pt_has_hash(ring->netdev, decoded))
> +		return;
> +
> +	status0 = le16_to_cpu(rx_desc->flex_wb.status_error0);
> +	if (status0 & IAVF_RX_FLEX_DESC_STATUS0_RSS_VALID_M) {
> +		hash = le32_to_cpu(rx_desc->flex_wb.rss_hash);
> +		libeth_rx_pt_set_hash(skb, hash, decoded);
> +	}
> +}

Also, just parse rx_ptype once in process_skb_fields(), you don't need
to do that in each function.

> +
>  /**
>   * iavf_process_skb_fields - Populate skb header fields from Rx descriptor
>   * @rx_ring: rx descriptor ring packet is being transacted on
> @@ -998,14 +1118,17 @@ static void iavf_rx_hash(struct iavf_ring *ring,
>   * order to populate the hash, checksum, VLAN, protocol, and
>   * other fields within the skb.
>   **/
> -static void
> -iavf_process_skb_fields(struct iavf_ring *rx_ring,
> -			union iavf_rx_desc *rx_desc, struct sk_buff *skb,
> -			u8 rx_ptype)
> +static void iavf_process_skb_fields(struct iavf_ring *rx_ring,
> +				    union iavf_rx_desc *rx_desc,

Const.

> +				    struct sk_buff *skb, u16 rx_ptype)
>  {
> -	iavf_rx_hash(rx_ring, rx_desc, skb, rx_ptype);
> -
> -	iavf_rx_checksum(rx_ring->vsi, skb, rx_desc);
> +	if (rx_ring->rxdid == VIRTCHNL_RXDID_1_32B_BASE) {

Any likely/unlikely() here? Or it's 50:50?

> +		iavf_legacy_rx_hash(rx_ring, rx_desc, skb, rx_ptype);
> +		iavf_legacy_rx_csum(rx_ring->vsi, skb, rx_desc);
> +	} else {
> +		iavf_flex_rx_hash(rx_ring, rx_desc, skb, rx_ptype);
> +		iavf_flex_rx_csum(rx_ring->vsi, skb, rx_desc);
> +	}
>  
>  	skb_record_rx_queue(skb, rx_ring->queue_index);
>  
> @@ -1092,7 +1215,7 @@ static struct sk_buff *iavf_build_skb(const struct libeth_fqe *rx_buffer,
>  /**
>   * iavf_is_non_eop - process handling of non-EOP buffers
>   * @rx_ring: Rx ring being processed
> - * @rx_desc: Rx descriptor for current buffer
> + * @fields: Rx descriptor extracted fields
>   * @skb: Current socket buffer containing buffer in progress
>   *
>   * This function updates next to clean.  If the buffer is an EOP buffer
> @@ -1101,7 +1224,7 @@ static struct sk_buff *iavf_build_skb(const struct libeth_fqe *rx_buffer,
>   * that this is in fact a non-EOP buffer.
>   **/
>  static bool iavf_is_non_eop(struct iavf_ring *rx_ring,
> -			    union iavf_rx_desc *rx_desc,
> +			    struct iavf_rx_extracted *fields,

Pass value instead of pointer.

>  			    struct sk_buff *skb)

Is it still needed?

>  {
>  	u32 ntc = rx_ring->next_to_clean + 1;
> @@ -1113,8 +1236,7 @@ static bool iavf_is_non_eop(struct iavf_ring *rx_ring,
>  	prefetch(IAVF_RX_DESC(rx_ring, ntc));
>  
>  	/* if we are the last buffer then there is nothing else to do */
> -#define IAVF_RXD_EOF BIT(IAVF_RX_DESC_STATUS_EOF_SHIFT)
> -	if (likely(iavf_test_staterr(rx_desc, IAVF_RXD_EOF)))
> +	if (likely(fields->end_of_packet))
>  		return false;
>  
>  	rx_ring->rx_stats.non_eop_descs++;
> @@ -1122,6 +1244,91 @@ static bool iavf_is_non_eop(struct iavf_ring *rx_ring,
>  	return true;
>  }
>  
> +/**
> + * iavf_extract_legacy_rx_fields - Extract fields from the Rx descriptor
> + * @rx_ring: rx descriptor ring
> + * @rx_desc: the descriptor to process
> + * @fields: storage for extracted values
> + *
> + * Decode the Rx descriptor and extract relevant information including the
> + * size, VLAN tag, Rx packet type, end of packet field and RXE field value.
> + *
> + * This function only operates on the VIRTCHNL_RXDID_1_32B_BASE legacy 32byte
> + * descriptor writeback format.
> + */
> +static void iavf_extract_legacy_rx_fields(struct iavf_ring *rx_ring,
> +					  union iavf_rx_desc *rx_desc,

Consts.

> +					  struct iavf_rx_extracted *fields)

Return a struct &iavf_rx_extracted instead of passing a pointer to it.

> +{
> +	u64 qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
> +
> +	fields->size = FIELD_GET(IAVF_RXD_QW1_LENGTH_PBUF_MASK, qword);
> +	fields->rx_ptype = FIELD_GET(IAVF_RXD_QW1_PTYPE_MASK, qword);
> +
> +	if (qword & IAVF_RX_DESC_STATUS_L2TAG1P_MASK &&
> +	    rx_ring->flags & IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1)
> +		fields->vlan_tag = le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1);
> +
> +	if (rx_desc->wb.qword2.ext_status &
> +	    cpu_to_le16(BIT(IAVF_RX_DESC_EXT_STATUS_L2TAG2P_SHIFT)) &&

Bitops must have own pairs of braces.

> +	    rx_ring->flags & IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2)
> +		fields->vlan_tag = le16_to_cpu(rx_desc->wb.qword2.l2tag2_2);
> +
> +	fields->end_of_packet = FIELD_GET(IAVF_RX_DESC_STATUS_EOF_MASK, qword);
> +	fields->rxe = FIELD_GET(BIT(IAVF_RXD_QW1_ERROR_SHIFT), qword);
> +}
> +
> +/**
> + * iavf_extract_flex_rx_fields - Extract fields from the Rx descriptor
> + * @rx_ring: rx descriptor ring
> + * @rx_desc: the descriptor to process
> + * @fields: storage for extracted values
> + *
> + * Decode the Rx descriptor and extract relevant information including the
> + * size, VLAN tag, Rx packet type, end of packet field and RXE field value.
> + *
> + * This function only operates on the VIRTCHNL_RXDID_2_FLEX_SQ_NIC flexible
> + * descriptor writeback format.
> + */
> +static void iavf_extract_flex_rx_fields(struct iavf_ring *rx_ring,
> +					union iavf_rx_desc *rx_desc,
> +					struct iavf_rx_extracted *fields)

Same for everything.

> +{
> +	u16 status0, status1, flexi_flags0;
> +
> +	fields->size = FIELD_GET(IAVF_RX_FLEX_DESC_PKT_LEN_M,
> +				 le16_to_cpu(rx_desc->flex_wb.pkt_len));

le16_get_bits().

> +
> +	flexi_flags0 = le16_to_cpu(rx_desc->flex_wb.ptype_flexi_flags0);
> +
> +	fields->rx_ptype = FIELD_GET(IAVF_RX_FLEX_DESC_PTYPE_M, flexi_flags0);

le16_get_bits() instead of these two?

> +
> +	status0 = le16_to_cpu(rx_desc->flex_wb.status_error0);
> +	if (status0 & IAVF_RX_FLEX_DESC_STATUS0_L2TAG1P_M &&
> +	    rx_ring->flags & IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1)

Braces for bitops.

> +		fields->vlan_tag = le16_to_cpu(rx_desc->flex_wb.l2tag1);
> +
> +	status1 = le16_to_cpu(rx_desc->flex_wb.status_error1);
> +	if (status1 & IAVF_RX_FLEX_DESC_STATUS1_L2TAG2P_M &&
> +	    rx_ring->flags & IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2)
> +		fields->vlan_tag = le16_to_cpu(rx_desc->flex_wb.l2tag2_2nd);

(same)

> +
> +	fields->end_of_packet = FIELD_GET(IAVF_RX_FLEX_DESC_STATUS_ERR0_EOP_BIT,
> +					  status0);
> +	fields->rxe = FIELD_GET(IAVF_RX_FLEX_DESC_STATUS_ERR0_RXE_BIT,
> +				status0);
> +}
> +
> +static void iavf_extract_rx_fields(struct iavf_ring *rx_ring,
> +				   union iavf_rx_desc *rx_desc,
> +				   struct iavf_rx_extracted *fields)

Consts + return struct @fields directly.

> +{
> +	if (rx_ring->rxdid == VIRTCHNL_RXDID_1_32B_BASE)

You check this several times, this could be combined and optimized.

> +		iavf_extract_legacy_rx_fields(rx_ring, rx_desc, fields);
> +	else
> +		iavf_extract_flex_rx_fields(rx_ring, rx_desc, fields);
> +}
> +
>  /**
>   * iavf_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
>   * @rx_ring: rx descriptor ring to transact packets on
> @@ -1142,12 +1349,9 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget)
>  	bool failure = false;
>  
>  	while (likely(total_rx_packets < (unsigned int)budget)) {
> +		struct iavf_rx_extracted fields = {};
>  		struct libeth_fqe *rx_buffer;
>  		union iavf_rx_desc *rx_desc;
> -		unsigned int size;
> -		u16 vlan_tag = 0;
> -		u8 rx_ptype;
> -		u64 qword;
>  
>  		/* return some buffers to hardware, one at a time is too slow */
>  		if (cleaned_count >= IAVF_RX_BUFFER_WRITE) {
> @@ -1158,35 +1362,27 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget)
>  
>  		rx_desc = IAVF_RX_DESC(rx_ring, rx_ring->next_to_clean);
>  
> -		/* status_error_len will always be zero for unused descriptors
> -		 * because it's cleared in cleanup, and overlaps with hdr_addr
> -		 * which is always zero because packet split isn't used, if the
> -		 * hardware wrote DD then the length will be non-zero
> -		 */
> -		qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
> -
>  		/* This memory barrier is needed to keep us from reading
>  		 * any other fields out of the rx_desc until we have
>  		 * verified the descriptor has been written back.
>  		 */
>  		dma_rmb();
> -#define IAVF_RXD_DD BIT(IAVF_RX_DESC_STATUS_DD_SHIFT)
> -		if (!iavf_test_staterr(rx_desc, IAVF_RXD_DD))
> +		if (!iavf_test_staterr(rx_desc, IAVF_RX_DESC_STATUS_DD_MASK))
>  			break;
>  
> -		size = FIELD_GET(IAVF_RXD_QW1_LENGTH_PBUF_MASK, qword);
> +		iavf_extract_rx_fields(rx_ring, rx_desc, &fields);
>  
>  		iavf_trace(clean_rx_irq, rx_ring, rx_desc, skb);
>  
>  		rx_buffer = &rx_ring->rx_fqes[rx_ring->next_to_clean];
> -		if (!libeth_rx_sync_for_cpu(rx_buffer, size))
> +		if (!libeth_rx_sync_for_cpu(rx_buffer, fields.size))
>  			goto skip_data;
>  
>  		/* retrieve a buffer from the ring */
>  		if (skb)
> -			iavf_add_rx_frag(skb, rx_buffer, size);
> +			iavf_add_rx_frag(skb, rx_buffer, fields.size);
>  		else
> -			skb = iavf_build_skb(rx_buffer, size);
> +			skb = iavf_build_skb(rx_buffer, fields.size);
>  
>  		/* exit if we failed to retrieve a buffer */
>  		if (!skb) {
> @@ -1197,15 +1393,14 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget)
>  skip_data:
>  		cleaned_count++;
>  
> -		if (iavf_is_non_eop(rx_ring, rx_desc, skb) || unlikely(!skb))
> +		if (iavf_is_non_eop(rx_ring, &fields, skb) || unlikely(!skb))
>  			continue;
>  
> -		/* ERR_MASK will only have valid bits if EOP set, and
> -		 * what we are doing here is actually checking
> -		 * IAVF_RX_DESC_ERROR_RXE_SHIFT, since it is the zeroth bit in
> -		 * the error field
> +		/* RXE field in descriptor is an indication of the MAC errors
> +		 * (like CRC, alignment, oversize etc). If it is set then iavf
> +		 * should finish.
>  		 */
> -		if (unlikely(iavf_test_staterr(rx_desc, BIT(IAVF_RXD_QW1_ERROR_SHIFT)))) {
> +		if (unlikely(fields.rxe)) {
>  			dev_kfree_skb_any(skb);
>  			skb = NULL;
>  			continue;
> @@ -1219,22 +1414,11 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget)
>  		/* probably a little skewed due to removing CRC */
>  		total_rx_bytes += skb->len;
>  
> -		qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
> -		rx_ptype = FIELD_GET(IAVF_RXD_QW1_PTYPE_MASK, qword);
> -
>  		/* populate checksum, VLAN, and protocol */
> -		iavf_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype);
> -
> -		if (qword & BIT(IAVF_RX_DESC_STATUS_L2TAG1P_SHIFT) &&
> -		    rx_ring->flags & IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1)
> -			vlan_tag = le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1);
> -		if (rx_desc->wb.qword2.ext_status &
> -		    cpu_to_le16(BIT(IAVF_RX_DESC_EXT_STATUS_L2TAG2P_SHIFT)) &&
> -		    rx_ring->flags & IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2)
> -			vlan_tag = le16_to_cpu(rx_desc->wb.qword2.l2tag2_2);

BTW I'm wondering whether filling the whole @fields can be less
optimized than accesssing descriptor fields one by one like it was here
before.
I mean, in some cases you won't need all the fields from the extracted
struct, but they will be read and initialized anyway.

> +		iavf_process_skb_fields(rx_ring, rx_desc, skb, fields.rx_ptype);
>  
>  		iavf_trace(clean_rx_irq_rx, rx_ring, rx_desc, skb);
> -		iavf_receive_skb(rx_ring, skb, vlan_tag);
> +		iavf_receive_skb(rx_ring, skb, fields.vlan_tag);
>  		skb = NULL;
>  
>  		/* update budget accounting */
> diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h
> index 17309d8625ac..3661cd57a068 100644
> --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h
> +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h
> @@ -99,6 +99,14 @@ static inline bool iavf_test_staterr(union iavf_rx_desc *rx_desc,
>  		  cpu_to_le64(stat_err_bits));
>  }
>  
> +struct iavf_rx_extracted {
> +	unsigned int size;
> +	u16 vlan_tag;
> +	u16 rx_ptype;
> +	u8 end_of_packet:1;
> +	u8 rxe:1;
> +};

Also something libethish, but not sure for this one.

> +
>  /* How many Rx Buffers do we bundle into one write to the hardware ? */
>  #define IAVF_RX_INCREMENT(r, i) \
>  	do {					\
> diff --git a/drivers/net/ethernet/intel/iavf/iavf_type.h b/drivers/net/ethernet/intel/iavf/iavf_type.h
> index f6b09e57abce..82c16a720807 100644
> --- a/drivers/net/ethernet/intel/iavf/iavf_type.h
> +++ b/drivers/net/ethernet/intel/iavf/iavf_type.h
> @@ -206,6 +206,45 @@ union iavf_16byte_rx_desc {
>  	} wb;  /* writeback */
>  };
>  
> +/* Rx Flex Descriptor NIC Profile
> + * RxDID Profile ID 2
> + * Flex-field 0: RSS hash lower 16-bits
> + * Flex-field 1: RSS hash upper 16-bits
> + * Flex-field 2: Flow ID lower 16-bits
> + * Flex-field 3: Flow ID higher 16-bits
> + * Flex-field 4: reserved, VLAN ID taken from L2Tag
> + */
> +struct iavf_32byte_rx_flex_wb {
> +	/* Qword 0 */
> +	u8 rxdid;
> +	u8 mir_id_umb_cast;
> +	__le16 ptype_flexi_flags0;
> +	__le16 pkt_len;
> +	__le16 hdr_len_sph_flex_flags1;
> +
> +	/* Qword 1 */
> +	__le16 status_error0;
> +	__le16 l2tag1;
> +	__le32 rss_hash;
> +
> +	/* Qword 2 */
> +	__le16 status_error1;
> +	u8 flexi_flags2;
> +	u8 ts_low;
> +	__le16 l2tag2_1st;
> +	__le16 l2tag2_2nd;
> +
> +	/* Qword 3 */
> +	__le32 flow_id;
> +	union {
> +		struct {
> +			__le16 rsvd;
> +			__le16 flow_id_ipv6;
> +		} flex;
> +		__le32 ts_high;
> +	} flex_ts;
> +};

I'm wondering whether HW descriptors can be defined as just a bunch of
u64 qwords instead of all those u8/__le16 etc. fields. That would be faster.
In case this would work differently on BE and LE, #ifdefs.

> +
>  union iavf_32byte_rx_desc {
>  	struct {
>  		__le64  pkt_addr; /* Packet buffer address */
> @@ -253,35 +292,34 @@ union iavf_32byte_rx_desc {
>  			} hi_dword;
>  		} qword3;
>  	} wb;  /* writeback */
> +	struct iavf_32byte_rx_flex_wb flex_wb;

So, already existing formats were described here directly, but flex is
declared as a field? Can this be more consistent?

>  };
>  
> -enum iavf_rx_desc_status_bits {
> -	/* Note: These are predefined bit offsets */
> -	IAVF_RX_DESC_STATUS_DD_SHIFT		= 0,
> -	IAVF_RX_DESC_STATUS_EOF_SHIFT		= 1,
> -	IAVF_RX_DESC_STATUS_L2TAG1P_SHIFT	= 2,
> -	IAVF_RX_DESC_STATUS_L3L4P_SHIFT		= 3,
> -	IAVF_RX_DESC_STATUS_CRCP_SHIFT		= 4,
> -	IAVF_RX_DESC_STATUS_TSYNINDX_SHIFT	= 5, /* 2 BITS */
> -	IAVF_RX_DESC_STATUS_TSYNVALID_SHIFT	= 7,
> -	/* Note: Bit 8 is reserved in X710 and XL710 */
> -	IAVF_RX_DESC_STATUS_EXT_UDP_0_SHIFT	= 8,
> -	IAVF_RX_DESC_STATUS_UMBCAST_SHIFT	= 9, /* 2 BITS */
> -	IAVF_RX_DESC_STATUS_FLM_SHIFT		= 11,
> -	IAVF_RX_DESC_STATUS_FLTSTAT_SHIFT	= 12, /* 2 BITS */
> -	IAVF_RX_DESC_STATUS_LPBK_SHIFT		= 14,
> -	IAVF_RX_DESC_STATUS_IPV6EXADD_SHIFT	= 15,
> -	IAVF_RX_DESC_STATUS_RESERVED_SHIFT	= 16, /* 2 BITS */
> -	/* Note: For non-tunnel packets INT_UDP_0 is the right status for
> -	 * UDP header
> -	 */
> -	IAVF_RX_DESC_STATUS_INT_UDP_0_SHIFT	= 18,
> -	IAVF_RX_DESC_STATUS_LAST /* this entry must be last!!! */
> -};
> +/* Note: These are predefined bit offsets */
> +#define IAVF_RX_DESC_STATUS_DD_MASK		BIT(0)
> +#define IAVF_RX_DESC_STATUS_EOF_MASK		BIT(1)
> +#define IAVF_RX_DESC_STATUS_L2TAG1P_MASK	BIT(2)
> +#define IAVF_RX_DESC_STATUS_L3L4P_MASK		BIT(3)
> +#define IAVF_RX_DESC_STATUS_CRCP_MASK		BIT(4)
> +#define IAVF_RX_DESC_STATUS_TSYNINDX_MASK	GENMASK_ULL(6, 5)
> +#define IAVF_RX_DESC_STATUS_TSYNVALID_MASK	BIT(7)
> +/* Note: Bit 8 is reserved in X710 and XL710 */
> +#define IAVF_RX_DESC_STATUS_EXT_UDP_0_MASK	BIT(8)
> +#define IAVF_RX_DESC_STATUS_UMBCAST_MASK	GENMASK_ULL(10, 9)
> +#define IAVF_RX_DESC_STATUS_FLM_MASK		BIT(11)
> +#define IAVF_RX_DESC_STATUS_FLTSTAT_MASK	GENMASK_ULL(13, 12)
> +#define IAVF_RX_DESC_STATUS_LPBK_MASK		BIT(14)
> +#define IAVF_RX_DESC_STATUS_IPV6EXADD_MASK	BIT(15)
> +#define IAVF_RX_DESC_STATUS_RESERVED_MASK	GENMASK_ULL(17, 16)
> +/* Note: For non-tunnel packets INT_UDP_0 is the right status for
> + * UDP header
> + */
> +#define IAVF_RX_DESC_STATUS_INT_UDP_0_MASK	BIT(18)
> +
> +#define IAVF_RX_FLEX_DESC_STATUS_ERR0_EOP_BIT	BIT(1)
> +#define IAVF_RX_FLEX_DESC_STATUS_ERR0_RXE_BIT	BIT(10)
>  
> -#define IAVF_RXD_QW1_STATUS_SHIFT	0
> -#define IAVF_RXD_QW1_STATUS_MASK	((BIT(IAVF_RX_DESC_STATUS_LAST) - 1) \
> -					 << IAVF_RXD_QW1_STATUS_SHIFT)
> +#define IAVF_RXD_QW1_STATUS_MASK		(BIT(19) - 1)

GENMASK().

>  
>  #define IAVF_RXD_QW1_STATUS_TSYNINDX_SHIFT IAVF_RX_DESC_STATUS_TSYNINDX_SHIFT
>  #define IAVF_RXD_QW1_STATUS_TSYNINDX_MASK  (0x3UL << \
> @@ -301,18 +339,16 @@ enum iavf_rx_desc_fltstat_values {
>  #define IAVF_RXD_QW1_ERROR_SHIFT	19
>  #define IAVF_RXD_QW1_ERROR_MASK		(0xFFUL << IAVF_RXD_QW1_ERROR_SHIFT)
>  
> -enum iavf_rx_desc_error_bits {
> -	/* Note: These are predefined bit offsets */
> -	IAVF_RX_DESC_ERROR_RXE_SHIFT		= 0,
> -	IAVF_RX_DESC_ERROR_RECIPE_SHIFT		= 1,
> -	IAVF_RX_DESC_ERROR_HBO_SHIFT		= 2,
> -	IAVF_RX_DESC_ERROR_L3L4E_SHIFT		= 3, /* 3 BITS */
> -	IAVF_RX_DESC_ERROR_IPE_SHIFT		= 3,
> -	IAVF_RX_DESC_ERROR_L4E_SHIFT		= 4,
> -	IAVF_RX_DESC_ERROR_EIPE_SHIFT		= 5,
> -	IAVF_RX_DESC_ERROR_OVERSIZE_SHIFT	= 6,
> -	IAVF_RX_DESC_ERROR_PPRS_SHIFT		= 7
> -};
> +/* Note: These are predefined bit offsets */
> +#define IAVF_RX_DESC_ERROR_RXE_MASK		BIT(0)
> +#define IAVF_RX_DESC_ERROR_RECIPE_MASK		BIT(1)
> +#define IAVF_RX_DESC_ERROR_HBO_MASK		BIT(2)
> +#define IAVF_RX_DESC_ERROR_L3L4E_MASK		GENMASK_ULL(5, 3)
> +#define IAVF_RX_DESC_ERROR_IPE_MASK		BIT(3)
> +#define IAVF_RX_DESC_ERROR_L4E_MASK		BIT(4)
> +#define IAVF_RX_DESC_ERROR_EIPE_MASK		BIT(5)
> +#define IAVF_RX_DESC_ERROR_OVERSIZE_MASK	BIT(6)
> +#define IAVF_RX_DESC_ERROR_PPRS_MASK		BIT(7)
>  
>  enum iavf_rx_desc_error_l3l4e_fcoe_masks {
>  	IAVF_RX_DESC_ERROR_L3L4E_NONE		= 0,
> @@ -325,6 +361,41 @@ enum iavf_rx_desc_error_l3l4e_fcoe_masks {
>  #define IAVF_RXD_QW1_PTYPE_SHIFT	30
>  #define IAVF_RXD_QW1_PTYPE_MASK		(0xFFULL << IAVF_RXD_QW1_PTYPE_SHIFT)
>  
> +/* for iavf_32byte_rx_flex_wb.ptype_flexi_flags0 member */
> +#define IAVF_RX_FLEX_DESC_PTYPE_M      (0x3FF) /* 10-bits */

Redundant braces + GENMASK()

> +
> +/* for iavf_32byte_rx_flex_wb.pkt_length member */
> +#define IAVF_RX_FLEX_DESC_PKT_LEN_M    (0x3FFF) /* 14-bits */

Same.

> +
> +/* Note: These are predefined bit offsets */
> +#define IAVF_RX_FLEX_DESC_STATUS0_DD_M			BIT(0)
> +#define IAVF_RX_FLEX_DESC_STATUS0_EOF_M			BIT(1)
> +#define IAVF_RX_FLEX_DESC_STATUS0_HBO_M			BIT(2)
> +#define IAVF_RX_FLEX_DESC_STATUS0_L3L4P_M		BIT(3)
> +#define IAVF_RX_FLEX_DESC_STATUS0_XSUM_IPE_M		BIT(4)
> +#define IAVF_RX_FLEX_DESC_STATUS0_XSUM_L4E_M		BIT(5)
> +#define IAVF_RX_FLEX_DESC_STATUS0_XSUM_EIPE_M		BIT(6)
> +#define IAVF_RX_FLEX_DESC_STATUS0_XSUM_EUDPE_M		BIT(7)
> +#define IAVF_RX_FLEX_DESC_STATUS0_LPBK_M		BIT(8)
> +#define IAVF_RX_FLEX_DESC_STATUS0_IPV6EXADD_M		BIT(9)
> +#define IAVF_RX_FLEX_DESC_STATUS0_RXE_M			BIT(10)
> +#define IAVF_RX_FLEX_DESC_STATUS0_CRCP_			BIT(11)
> +#define IAVF_RX_FLEX_DESC_STATUS0_RSS_VALID_M		BIT(12)
> +#define IAVF_RX_FLEX_DESC_STATUS0_L2TAG1P_M		BIT(13)
> +#define IAVF_RX_FLEX_DESC_STATUS0_XTRMD0_VALID_M	BIT(14)
> +#define IAVF_RX_FLEX_DESC_STATUS0_XTRMD1_VALID_M	BIT(15)
> +
> +/* Note: These are predefined bit offsets */
> +#define IAVF_RX_FLEX_DESC_STATUS1_CPM_M			(0xFULL) /* 4 bits */

Redundant braces.
+ GENMASK_ULL(7, 0)?

> +#define IAVF_RX_FLEX_DESC_STATUS1_NAT_M			BIT(4)
> +#define IAVF_RX_FLEX_DESC_STATUS1_CRYPTO_M		BIT(5)
> +/* [10:6] reserved */
> +#define IAVF_RX_FLEX_DESC_STATUS1_L2TAG2P_M		BIT(11)
> +#define IAVF_RX_FLEX_DESC_STATUS1_XTRMD2_VALID_M	BIT(12)
> +#define IAVF_RX_FLEX_DESC_STATUS1_XTRMD3_VALID_M	BIT(13)
> +#define IAVF_RX_FLEX_DESC_STATUS1_XTRMD4_VALID_M	BIT(14)
> +#define IAVF_RX_FLEX_DESC_STATUS1_XTRMD5_VALID_M	BIT(15)
> +
>  #define IAVF_RXD_QW1_LENGTH_PBUF_SHIFT	38
>  #define IAVF_RXD_QW1_LENGTH_PBUF_MASK	(0x3FFFULL << \
>  					 IAVF_RXD_QW1_LENGTH_PBUF_SHIFT)
> diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
> index 2693c3ad0830..5cbb375b7063 100644
> --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
> +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
> @@ -402,6 +402,7 @@ void iavf_configure_queues(struct iavf_adapter *adapter)
>  	int pairs = adapter->num_active_queues;
>  	struct virtchnl_queue_pair_info *vqpi;
>  	u32 i, max_frame;
> +	u8 rx_flags = 0;
>  	size_t len;
>  
>  	max_frame = LIBIE_MAX_RX_FRM_LEN(adapter->rx_rings->pp->p.offset);
> @@ -419,6 +420,9 @@ void iavf_configure_queues(struct iavf_adapter *adapter)
>  	if (!vqci)
>  		return;
>  
> +	if (iavf_ptp_cap_supported(adapter, VIRTCHNL_1588_PTP_CAP_RX_TSTAMP))
> +		rx_flags |= VIRTCHNL_PTP_RX_TSTAMP;
> +
>  	vqci->vsi_id = adapter->vsi_res->vsi_id;
>  	vqci->num_queue_pairs = pairs;
>  	vqpi = vqci->qpair;
> @@ -441,6 +445,7 @@ void iavf_configure_queues(struct iavf_adapter *adapter)
>  		if (CRC_OFFLOAD_ALLOWED(adapter))
>  			vqpi->rxq.crc_disable = !!(adapter->netdev->features &
>  						   NETIF_F_RXFCS);
> +		vqpi->rxq.flags = rx_flags;
>  		vqpi++;
>  	}

Thanks,
Olek