lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Mon, 25 Aug 2014 20:42:59 -0700
From:	Tom Herbert <therbert@...gle.com>
To:	David Miller <davem@...emloft.net>
Cc:	Linux Netdev List <netdev@...r.kernel.org>,
	Jamal Hadi Salim <jhs@...atatu.com>,
	Hannes Frederic Sowa <hannes@...essinduktion.org>,
	Eric Dumazet <edumazet@...gle.com>,
	Jeff Kirsher <jeffrey.t.kirsher@...el.com>,
	Rusty Russell <rusty@...tcorp.com.au>,
	Daniel Borkmann <dborkman@...hat.com>, brouer@...hat.com
Subject: Re: [PATCH 1/2] net: Remove ndo_xmit_flush netdev operation, use
 signalling instead.

On Mon, Aug 25, 2014 at 4:35 PM, David Miller <davem@...emloft.net> wrote:
>
> As reported by Jesper Dangaard Brouer, for high packet rates the
> overhead of having another indirect call in the TX path is
> non-trivial.
>
> There is the indirect call itself, and then there is all of the
> reloading of the state to refetch the tail pointer value and
> then write the device register.
>
> Move to a more passive scheme, which requires very light modifications
> to the device drivers.
>
> The signal is a new skb->xmit_more value, if it is non-zero it means
> that more SKBs are pending to be transmitted on the same queue as the
> current SKB.  And therefore, the driver may elide the tail pointer
> update.
>
> Right now skb->xmit_more is always zero.
>
> Signed-off-by: David S. Miller <davem@...emloft.net>
> ---
>  drivers/net/ethernet/intel/igb/igb_main.c | 36 +++++++++++--------------------
>  drivers/net/virtio_net.c                  | 12 +++--------
>  include/linux/netdevice.h                 | 25 ++-------------------
>  include/linux/skbuff.h                    |  2 ++
>  4 files changed, 19 insertions(+), 56 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
> index b9c020a..89c29b4 100644
> --- a/drivers/net/ethernet/intel/igb/igb_main.c
> +++ b/drivers/net/ethernet/intel/igb/igb_main.c
> @@ -136,7 +136,6 @@ static void igb_update_phy_info(unsigned long);
>  static void igb_watchdog(unsigned long);
>  static void igb_watchdog_task(struct work_struct *);
>  static netdev_tx_t igb_xmit_frame(struct sk_buff *skb, struct net_device *);
> -static void igb_xmit_flush(struct net_device *netdev, u16 queue);
>  static struct rtnl_link_stats64 *igb_get_stats64(struct net_device *dev,
>                                           struct rtnl_link_stats64 *stats);
>  static int igb_change_mtu(struct net_device *, int);
> @@ -2076,7 +2075,6 @@ static const struct net_device_ops igb_netdev_ops = {
>         .ndo_open               = igb_open,
>         .ndo_stop               = igb_close,
>         .ndo_start_xmit         = igb_xmit_frame,
> -       .ndo_xmit_flush         = igb_xmit_flush,
>         .ndo_get_stats64        = igb_get_stats64,
>         .ndo_set_rx_mode        = igb_set_rx_mode,
>         .ndo_set_mac_address    = igb_set_mac,
> @@ -4917,6 +4915,14 @@ static void igb_tx_map(struct igb_ring *tx_ring,
>
>         tx_ring->next_to_use = i;
>
> +       if (!skb->xmit_more) {
> +               writel(i, tx_ring->tail);
> +
> +               /* we need this if more than one processor can write to our tail
> +                * at a time, it synchronizes IO on IA64/Altix systems
> +                */
> +               mmiowb();
> +       }
>         return;
>
I would suggest the flush should be done if !skb->xmit_more or the queue
is being stopped. So maybe pull this code into its own function, e.g.
igb_flush(tx_ring). Then do:

if (igb_maybe_stop_tx(tx_ring, DESC_NEEDED) || !skb->xmit_more)
   igb_flush(tx_ring);

>  dma_error:
> @@ -5052,20 +5058,17 @@ out_drop:
>         return NETDEV_TX_OK;
>  }
>
> -static struct igb_ring *__igb_tx_queue_mapping(struct igb_adapter *adapter, unsigned int r_idx)
> +static inline struct igb_ring *igb_tx_queue_mapping(struct igb_adapter *adapter,
> +                                                   struct sk_buff *skb)
>  {
> +       unsigned int r_idx = skb->queue_mapping;
> +
>         if (r_idx >= adapter->num_tx_queues)
>                 r_idx = r_idx % adapter->num_tx_queues;
>
>         return adapter->tx_ring[r_idx];
>  }
>
> -static inline struct igb_ring *igb_tx_queue_mapping(struct igb_adapter *adapter,
> -                                                   struct sk_buff *skb)
> -{
> -       return __igb_tx_queue_mapping(adapter, skb->queue_mapping);
> -}
> -
>  static netdev_tx_t igb_xmit_frame(struct sk_buff *skb,
>                                   struct net_device *netdev)
>  {
> @@ -5094,21 +5097,6 @@ static netdev_tx_t igb_xmit_frame(struct sk_buff *skb,
>         return igb_xmit_frame_ring(skb, igb_tx_queue_mapping(adapter, skb));
>  }
>
> -static void igb_xmit_flush(struct net_device *netdev, u16 queue)
> -{
> -       struct igb_adapter *adapter = netdev_priv(netdev);
> -       struct igb_ring *tx_ring;
> -
> -       tx_ring = __igb_tx_queue_mapping(adapter, queue);
> -
> -       writel(tx_ring->next_to_use, tx_ring->tail);
> -
> -       /* we need this if more than one processor can write to our tail
> -        * at a time, it synchronizes IO on IA64/Altix systems
> -        */
> -       mmiowb();
> -}
> -
>  /**
>   *  igb_tx_timeout - Respond to a Tx Hang
>   *  @netdev: network interface device structure
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 6242108..f0c2824 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -953,15 +953,10 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>                 }
>         }
>
> -       return NETDEV_TX_OK;
> -}
> +       if (!skb->xmit_more)
> +               virtqueue_kick(sq->vq);
>
> -static void xmit_flush(struct net_device *dev, u16 qnum)
> -{
> -       struct virtnet_info *vi = netdev_priv(dev);
> -       struct send_queue *sq = &vi->sq[qnum];
> -
> -       virtqueue_kick(sq->vq);
> +       return NETDEV_TX_OK;
>  }
>
>  /*
> @@ -1393,7 +1388,6 @@ static const struct net_device_ops virtnet_netdev = {
>         .ndo_open            = virtnet_open,
>         .ndo_stop            = virtnet_close,
>         .ndo_start_xmit      = start_xmit,
> -       .ndo_xmit_flush      = xmit_flush,
>         .ndo_validate_addr   = eth_validate_addr,
>         .ndo_set_mac_address = virtnet_set_mac_address,
>         .ndo_set_rx_mode     = virtnet_set_rx_mode,
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 220c509..039b237 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -782,19 +782,6 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
>   *        (can also return NETDEV_TX_LOCKED iff NETIF_F_LLTX)
>   *     Required can not be NULL.
>   *
> - * void (*ndo_xmit_flush)(struct net_device *dev, u16 queue);
> - *     A driver implements this function when it wishes to support
> - *     deferred TX queue flushing.  The idea is that the expensive
> - *     operation to trigger TX queue processing can be done after
> - *     N calls to ndo_start_xmit rather than being done every single
> - *     time.  In this regime ndo_start_xmit will be called one or more
> - *     times, and then a final ndo_xmit_flush call will be made to
> - *     have the driver tell the device about the new pending TX queue
> - *     entries.  The kernel keeps track of which queues need flushing
> - *     by monitoring skb->queue_mapping of the packets it submits to
> - *     ndo_start_xmit.  This is the queue value that will be passed
> - *     to ndo_xmit_flush.
> - *
>   * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb,
>   *                         void *accel_priv, select_queue_fallback_t fallback);
>   *     Called to decide which queue to when device supports multiple
> @@ -1018,7 +1005,6 @@ struct net_device_ops {
>         int                     (*ndo_stop)(struct net_device *dev);
>         netdev_tx_t             (*ndo_start_xmit) (struct sk_buff *skb,
>                                                    struct net_device *dev);
> -       void                    (*ndo_xmit_flush)(struct net_device *dev, u16 queue);
>         u16                     (*ndo_select_queue)(struct net_device *dev,
>                                                     struct sk_buff *skb,
>                                                     void *accel_priv,
> @@ -3447,15 +3433,8 @@ int __init dev_proc_init(void);
>  static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
>                                               struct sk_buff *skb, struct net_device *dev)
>  {
> -       netdev_tx_t ret;
> -       u16 q;
> -
> -       q = skb->queue_mapping;
> -       ret = ops->ndo_start_xmit(skb, dev);
> -       if (dev_xmit_complete(ret) && ops->ndo_xmit_flush)
> -               ops->ndo_xmit_flush(dev, q);
> -
> -       return ret;
> +       skb->xmit_more = 0;
> +       return ops->ndo_start_xmit(skb, dev);
>  }
>
>  static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev)
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 18ddf96..9b3802a 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -452,6 +452,7 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1,
>   *     @tc_verd: traffic control verdict
>   *     @hash: the packet hash
>   *     @queue_mapping: Queue mapping for multiqueue devices
> + *     @xmit_more: More SKBs are pending for this queue
>   *     @ndisc_nodetype: router type (from link layer)
>   *     @ooo_okay: allow the mapping of a socket to a queue to be changed
>   *     @l4_hash: indicate hash is a canonical 4-tuple hash over transport
> @@ -558,6 +559,7 @@ struct sk_buff {
>
>         __u16                   queue_mapping;
>         kmemcheck_bitfield_begin(flags2);
> +       __u8                    xmit_more:1;
>  #ifdef CONFIG_IPV6_NDISC_NODETYPE
>         __u8                    ndisc_nodetype:2;
>  #endif
> --
> 1.7.11.7
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ