lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CALzJLG-rNgKSroZg5+XsHRF8vPb=+b4V2cSchAz3c6+cvOW96w@mail.gmail.com>
Date:   Thu, 9 Feb 2017 19:15:59 +0200
From:   Saeed Mahameed <saeedm@....mellanox.co.il>
To:     Eric Dumazet <edumazet@...gle.com>
Cc:     "David S . Miller" <davem@...emloft.net>,
        netdev <netdev@...r.kernel.org>,
        Tariq Toukan <tariqt@...lanox.com>,
        Martin KaFai Lau <kafai@...com>,
        Willem de Bruijn <willemb@...gle.com>,
        Jesper Dangaard Brouer <brouer@...hat.com>,
        Brenden Blanco <bblanco@...mgrid.com>,
        Alexei Starovoitov <ast@...nel.org>,
        Eric Dumazet <eric.dumazet@...il.com>
Subject: Re: [PATCH v2 net-next 14/14] mlx4: remove duplicate code in mlx4_en_process_rx_cq()

On Thu, Feb 9, 2017 at 3:58 PM, Eric Dumazet <edumazet@...gle.com> wrote:
> We should keep one way to build skbs, regardless of GRO being on or off.
>
> Note that I made sure to defer as much as possible the point we need to
> pull data from the frame, so that future prefetch() we might add
> are more effective.
>
> These skb attributes derive from the CQE or ring :
>  ip_summed, csum
>  hash
>  vlan offload
>  hwtstamps
>  queue_mapping
>
> As a bonus, this patch removes mlx4 dependency on eth_get_headlen()

So no copy at all in driver RX ?  as it is handled in
napi_gro_frags(&cq->napi); ?

General question, why not just use build_skb() ? which approach is better ?

> which is very often broken enough to give us headaches.
>
> Signed-off-by: Eric Dumazet <edumazet@...gle.com>
> ---
>  drivers/net/ethernet/mellanox/mlx4/en_rx.c | 203 ++++++-----------------------
>  1 file changed, 37 insertions(+), 166 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> index 0a87cc96e90ce7374821a0b4712d4b85ad8e..ef6c295789803a69f7066f7e5c80d1dc37f2 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> @@ -526,64 +526,6 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
>         return 0;
>  }
>
> -
> -static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv,
> -                                     struct mlx4_en_rx_alloc *frags,
> -                                     unsigned int length)
> -{
> -       struct sk_buff *skb;
> -       void *va;
> -       int used_frags;
> -       dma_addr_t dma;
> -
> -       skb = netdev_alloc_skb(priv->dev, SMALL_PACKET_SIZE + NET_IP_ALIGN);
> -       if (unlikely(!skb)) {
> -               en_dbg(RX_ERR, priv, "Failed allocating skb\n");
> -               return NULL;
> -       }
> -       skb_reserve(skb, NET_IP_ALIGN);
> -       skb->len = length;
> -
> -       /* Get pointer to first fragment so we could copy the headers into the
> -        * (linear part of the) skb */
> -       va = page_address(frags[0].page) + frags[0].page_offset;
> -
> -       if (length <= SMALL_PACKET_SIZE) {
> -               /* We are copying all relevant data to the skb - temporarily
> -                * sync buffers for the copy */
> -
> -               dma = frags[0].dma + frags[0].page_offset;
> -               dma_sync_single_for_cpu(priv->ddev, dma, length,
> -                                       DMA_FROM_DEVICE);
> -               skb_copy_to_linear_data(skb, va, length);
> -               skb->tail += length;
> -       } else {
> -               unsigned int pull_len;
> -
> -               /* Move relevant fragments to skb */
> -               used_frags = mlx4_en_complete_rx_desc(priv, frags,
> -                                                       skb, length);
> -               if (unlikely(!used_frags)) {
> -                       kfree_skb(skb);
> -                       return NULL;
> -               }
> -               skb_shinfo(skb)->nr_frags = used_frags;
> -
> -               pull_len = eth_get_headlen(va, SMALL_PACKET_SIZE);
> -               /* Copy headers into the skb linear buffer */
> -               memcpy(skb->data, va, pull_len);
> -               skb->tail += pull_len;
> -
> -               /* Skip headers in first fragment */
> -               skb_shinfo(skb)->frags[0].page_offset += pull_len;
> -
> -               /* Adjust size of first fragment */
> -               skb_frag_size_sub(&skb_shinfo(skb)->frags[0], pull_len);
> -               skb->data_len = length - pull_len;
> -       }
> -       return skb;
> -}
> -
>  static void validate_loopback(struct mlx4_en_priv *priv, void *va)
>  {
>         const unsigned char *data = va + ETH_HLEN;
> @@ -792,8 +734,6 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
>                  */
>                 length = be32_to_cpu(cqe->byte_cnt);
>                 length -= ring->fcs_del;
> -               l2_tunnel = (dev->hw_enc_features & NETIF_F_RXCSUM) &&
> -                       (cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_L2_TUNNEL));
>
>                 /* A bpf program gets first chance to drop the packet. It may
>                  * read bytes but not past the end of the frag.
> @@ -849,122 +789,51 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
>                 ring->bytes += length;
>                 ring->packets++;
>
> +               skb = napi_get_frags(&cq->napi);
> +               if (!skb)
> +                       goto next;
> +
> +               if (unlikely(ring->hwtstamp_rx_filter == HWTSTAMP_FILTER_ALL)) {
> +                       timestamp = mlx4_en_get_cqe_ts(cqe);
> +                       mlx4_en_fill_hwtstamps(mdev, skb_hwtstamps(skb),
> +                                              timestamp);
> +               }
> +               skb_record_rx_queue(skb, cq->ring);
> +
>                 if (likely(dev->features & NETIF_F_RXCSUM)) {
>                         if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_TCP |
>                                                       MLX4_CQE_STATUS_UDP)) {
>                                 if ((cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) &&
>                                     cqe->checksum == cpu_to_be16(0xffff)) {
>                                         ip_summed = CHECKSUM_UNNECESSARY;
> +                                       l2_tunnel = (dev->hw_enc_features & NETIF_F_RXCSUM) &&
> +                                               (cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_L2_TUNNEL));
> +                                       if (l2_tunnel)
> +                                               skb->csum_level = 1;
>                                         ring->csum_ok++;
>                                 } else {
> -                                       ip_summed = CHECKSUM_NONE;
> -                                       ring->csum_none++;
> +                                       goto csum_none;
>                                 }
>                         } else {
>                                 if (priv->flags & MLX4_EN_FLAG_RX_CSUM_NON_TCP_UDP &&
>                                     (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV4 |
>                                                                MLX4_CQE_STATUS_IPV6))) {
> -                                       ip_summed = CHECKSUM_COMPLETE;
> -                                       ring->csum_complete++;
> +                                       if (check_csum(cqe, skb, va, dev->features)) {
> +                                               goto csum_none;
> +                                       } else {
> +                                               ip_summed = CHECKSUM_COMPLETE;
> +                                               ring->csum_complete++;
> +                                       }
>                                 } else {
> -                                       ip_summed = CHECKSUM_NONE;
> -                                       ring->csum_none++;
> +                                       goto csum_none;
>                                 }
>                         }
>                 } else {
> +csum_none:
>                         ip_summed = CHECKSUM_NONE;
>                         ring->csum_none++;
>                 }

Why not just move this whole csum logic to a dedicated function?
All it needs to do is update some csum statistics and determine
the "ip_summed" value.

> -
> -               /* This packet is eligible for GRO if it is:
> -                * - DIX Ethernet (type interpretation)
> -                * - TCP/IP (v4)
> -                * - without IP options
> -                * - not an IP fragment
> -                */
> -               if (dev->features & NETIF_F_GRO) {
> -                       struct sk_buff *gro_skb = napi_get_frags(&cq->napi);
> -                       if (!gro_skb)
> -                               goto next;
> -
> -                       nr = mlx4_en_complete_rx_desc(priv, frags, gro_skb,
> -                                                     length);
> -                       if (!nr)
> -                               goto next;
> -
> -                       if (ip_summed == CHECKSUM_COMPLETE) {
> -                               if (check_csum(cqe, gro_skb, va,
> -                                              dev->features)) {
> -                                       ip_summed = CHECKSUM_NONE;
> -                                       ring->csum_none++;
> -                                       ring->csum_complete--;
> -                               }
> -                       }
> -
> -                       skb_shinfo(gro_skb)->nr_frags = nr;
> -                       gro_skb->len = length;
> -                       gro_skb->data_len = length;
> -                       gro_skb->ip_summed = ip_summed;
> -
> -                       if (l2_tunnel && ip_summed == CHECKSUM_UNNECESSARY)
> -                               gro_skb->csum_level = 1;
> -
> -                       if ((cqe->vlan_my_qpn &
> -                           cpu_to_be32(MLX4_CQE_CVLAN_PRESENT_MASK)) &&
> -                           (dev->features & NETIF_F_HW_VLAN_CTAG_RX)) {
> -                               u16 vid = be16_to_cpu(cqe->sl_vid);
> -
> -                               __vlan_hwaccel_put_tag(gro_skb, htons(ETH_P_8021Q), vid);
> -                       } else if ((be32_to_cpu(cqe->vlan_my_qpn) &
> -                                 MLX4_CQE_SVLAN_PRESENT_MASK) &&
> -                                (dev->features & NETIF_F_HW_VLAN_STAG_RX)) {
> -                               __vlan_hwaccel_put_tag(gro_skb,
> -                                                      htons(ETH_P_8021AD),
> -                                                      be16_to_cpu(cqe->sl_vid));
> -                       }
> -
> -                       if (dev->features & NETIF_F_RXHASH)
> -                               skb_set_hash(gro_skb,
> -                                            be32_to_cpu(cqe->immed_rss_invalid),
> -                                            (ip_summed == CHECKSUM_UNNECESSARY) ?
> -                                               PKT_HASH_TYPE_L4 :
> -                                               PKT_HASH_TYPE_L3);
> -
> -                       skb_record_rx_queue(gro_skb, cq->ring);
> -
> -                       if (ring->hwtstamp_rx_filter == HWTSTAMP_FILTER_ALL) {
> -                               timestamp = mlx4_en_get_cqe_ts(cqe);
> -                               mlx4_en_fill_hwtstamps(mdev,
> -                                                      skb_hwtstamps(gro_skb),
> -                                                      timestamp);
> -                       }
> -
> -                       napi_gro_frags(&cq->napi);
> -                       goto next;
> -               }
> -
> -               /* GRO not possible, complete processing here */
> -               skb = mlx4_en_rx_skb(priv, frags, length);
> -               if (unlikely(!skb)) {
> -                       ring->dropped++;
> -                       goto next;
> -               }
> -
> -               if (ip_summed == CHECKSUM_COMPLETE) {
> -                       if (check_csum(cqe, skb, va, dev->features)) {
> -                               ip_summed = CHECKSUM_NONE;
> -                               ring->csum_complete--;
> -                               ring->csum_none++;
> -                       }
> -               }
> -

Finally! Best patch ever !
Thank you Eric.

>                 skb->ip_summed = ip_summed;
> -               skb->protocol = eth_type_trans(skb, dev);

Not required anymore ?  I know napi_frags_skb does the job,
but what about skb->pkt_type field which is handled in eth_type_trans ?

> -               skb_record_rx_queue(skb, cq->ring);
> -
> -               if (l2_tunnel && ip_summed == CHECKSUM_UNNECESSARY)
> -                       skb->csum_level = 1;
> -
>                 if (dev->features & NETIF_F_RXHASH)
>                         skb_set_hash(skb,
>                                      be32_to_cpu(cqe->immed_rss_invalid),
> @@ -972,23 +841,25 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
>                                         PKT_HASH_TYPE_L4 :
>                                         PKT_HASH_TYPE_L3);
>
> -               if ((be32_to_cpu(cqe->vlan_my_qpn) &
> -                   MLX4_CQE_CVLAN_PRESENT_MASK) &&
> +
> +               if ((cqe->vlan_my_qpn &
> +                    cpu_to_be32(MLX4_CQE_CVLAN_PRESENT_MASK)) &&
>                     (dev->features & NETIF_F_HW_VLAN_CTAG_RX))
> -                       __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), be16_to_cpu(cqe->sl_vid));
> -               else if ((be32_to_cpu(cqe->vlan_my_qpn) &
> -                         MLX4_CQE_SVLAN_PRESENT_MASK) &&
> +                       __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
> +                                              be16_to_cpu(cqe->sl_vid));
> +               else if ((cqe->vlan_my_qpn &
> +                         cpu_to_be32(MLX4_CQE_SVLAN_PRESENT_MASK)) &&
>                          (dev->features & NETIF_F_HW_VLAN_STAG_RX))
>                         __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021AD),
>                                                be16_to_cpu(cqe->sl_vid));
>
> -               if (ring->hwtstamp_rx_filter == HWTSTAMP_FILTER_ALL) {
> -                       timestamp = mlx4_en_get_cqe_ts(cqe);
> -                       mlx4_en_fill_hwtstamps(mdev, skb_hwtstamps(skb),
> -                                              timestamp);
> +               nr = mlx4_en_complete_rx_desc(priv, frags, skb, length);
> +               if (nr) {

likely() ?

> +                       skb_shinfo(skb)->nr_frags = nr;
> +                       skb->len = length;
> +                       skb->data_len = length;
> +                       napi_gro_frags(&cq->napi);
>                 }

no need to kfree_skb/reset some SKB fields in case nr == 0 ?
since this SKB will be returned via napi_get_frags if you didn't
actually consume it ..

For example, the 1st packet is a vlan packet and for some reason
mlx4_en_complete_rx_desc returned 0 on it.
The next packet is a non-vlan packet that will get the same skb as the 1st
packet, with stale SKB fields such as skb->vlan_tci!

> -
> -               napi_gro_receive(&cq->napi, skb);
>  next:
>                 ++cq->mcq.cons_index;
>                 index = (cq->mcq.cons_index) & ring->size_mask;
> --
> 2.11.0.483.g087da7b7c-goog
>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ