netdev - Re: [PATCH RFC 4/4] xfs: Transmit flow steering

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAKgT0UcTS=idZkwcMtLvqSJThjE8kAkAnkqhF1mzG2cJ_OcLdA@mail.gmail.com>
Date:   Tue, 30 Aug 2016 20:14:48 -0700
From:   Alexander Duyck <alexander.duyck@...il.com>
To:     Tom Herbert <tom@...bertland.com>
Cc:     David Miller <davem@...emloft.net>,
        Netdev <netdev@...r.kernel.org>,
        Rick Jones <rick.jones2@....com>,
        Kernel Team <kernel-team@...com>
Subject: Re: [PATCH RFC 4/4] xfs: Transmit flow steering

On Tue, Aug 30, 2016 at 5:00 PM, Tom Herbert <tom@...bertland.com> wrote:
> XFS maintains a per device flow table that is indexed by the skbuff
> hash. The XFS table is only consulted when there is no queue saved in
> a transmit socket for an skbuff.
>
> Each entry in the flow table contains a queue index and a queue
> pointer. The queue pointer is set when a queue is chosen using a
> flow table entry. This pointer is set to the head pointer in the
> transmit queue (which is maintained by BQL).
>
> The new function get_xfs_index looks up flows in the XPS table.
> The entry returned gives the last queue a matching flow used. The
> returned queue is compared against the normal XPS queue. If they
> are different, then we only switch if the tail pointer in the TX
> queue has advanced past the pointer saved in the entry. In this
> way OOO should be avoided when XPS wants to use a different queue.
>
> Signed-off-by: Tom Herbert <tom@...bertland.com>

This looks pretty good.  I haven't had a chance to test it yet, though;
that will probably take me a few days.

A few minor items called out below.

Thanks.

- Alex

> ---
>  net/Kconfig    |  6 ++++
>  net/core/dev.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++----------
>  2 files changed, 84 insertions(+), 15 deletions(-)
>
> diff --git a/net/Kconfig b/net/Kconfig
> index 7b6cd34..5e3eddf 100644
> --- a/net/Kconfig
> +++ b/net/Kconfig
> @@ -255,6 +255,12 @@ config XPS
>         depends on SMP
>         default y
>
> +config XFS
> +       bool
> +       depends on XPS
> +       depends on BQL
> +       default y
> +
>  config HWBM
>         bool
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 1d5c6dd..722e487 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3210,6 +3210,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
>  }
>  #endif /* CONFIG_NET_EGRESS */
>
> +/* Must be called with RCU read_lock */
>  static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>  {
>  #ifdef CONFIG_XPS
> @@ -3217,7 +3218,6 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>         struct xps_map *map;
>         int queue_index = -1;
>
> -       rcu_read_lock();
>         dev_maps = rcu_dereference(dev->xps_maps);
>         if (dev_maps) {
>                 map = rcu_dereference(
> @@ -3232,7 +3232,6 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>                                 queue_index = -1;
>                 }
>         }
> -       rcu_read_unlock();
>
>         return queue_index;
>  #else
> @@ -3240,26 +3239,90 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>  #endif
>  }
>
> -static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
> +/* Must be called with RCU read_lock */
> +static int get_xfs_index(struct net_device *dev, struct sk_buff *skb)
>  {
> -       struct sock *sk = skb->sk;
> -       int queue_index = sk_tx_queue_get(sk);
> +#ifdef CONFIG_XFS
> +       struct xps_dev_flow_table *flow_table;
> +       struct xps_dev_flow ent;
> +       int queue_index;
> +       struct netdev_queue *txq;
> +       u32 hash;
>
> -       if (queue_index < 0 || skb->ooo_okay ||
> -           queue_index >= dev->real_num_tx_queues) {
> -               int new_index = get_xps_queue(dev, skb);
> -               if (new_index < 0)
> -                       new_index = skb_tx_hash(dev, skb);
> +       flow_table = rcu_dereference(dev->xps_flow_table);
> +       if (!flow_table)
> +               return -1;
>
> -               if (queue_index != new_index && sk &&
> -                   sk_fullsock(sk) &&
> -                   rcu_access_pointer(sk->sk_dst_cache))
> -                       sk_tx_queue_set(sk, new_index);
> +       queue_index = get_xps_queue(dev, skb);
> +       if (queue_index < 0)
> +               return -1;

Actually I think this bit here probably needs to fall back to using
skb_tx_hash if you don't get a usable result.  The problem is you
could have a system that is running with a mix of XFS assigned for
some CPUs and just using skb_tx_hash for others.  We shouldn't steal
flows from the ones selected using skb_tx_hash until they have met the
flow transition criteria.

> -               queue_index = new_index;
> +       hash = skb_get_hash(skb);
> +       if (!hash)
> +               return -1;

I'm not sure the !hash test makes any sense.  Isn't 0 a valid hash value?

> +       ent.v64 = flow_table->flows[hash & flow_table->mask].v64;
> +       if (ent.queue_index >= 0 &&
> +           ent.queue_index < dev->real_num_tx_queues) {
> +               txq = netdev_get_tx_queue(dev, ent.queue_index);
> +               if (queue_index != ent.queue_index) {
> +                       if ((int)(txq->tail_cnt - ent.queue_ptr) >= 0)  {
> +                               /* The current queue's tail has advanced
> +                                * beyone the last packet that was

Small typo here, "beyond" instead of "beyone".

> +                                * enqueued using the table entry. All
> +                                * previous packets sent for this flow
> +                                * should have been completed so the
> +                                * queue for the flow can be changed.
> +                                */
> +                               ent.queue_index = queue_index;
> +                               txq = netdev_get_tx_queue(dev, queue_index);

You might want to consider using a goto label instead of duplicating
this block of code here and in the else section below.

> +                       } else {
> +                               queue_index = ent.queue_index;
> +                       }
> +               }
> +       } else {
> +               /* Queue from the table was bad, use the new one. */
> +               ent.queue_index = queue_index;
> +               txq = netdev_get_tx_queue(dev, queue_index);
>         }
>
> +       /* Save the updated entry */
> +       ent.queue_ptr = txq->head_cnt;
> +       flow_table->flows[hash & flow_table->mask].v64 = ent.v64;
> +
>         return queue_index;
> +#else
> +       return = get_xps_queue(dev, skb);
> +#endif
> +}
> +
> +static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
> +{
> +       struct sock *sk = skb->sk;
> +       int queue_index = sk_tx_queue_get(sk);
> +       int new_index;
> +
> +       if (queue_index < 0) {
> +               /* Socket did not provide a queue index, try XFS */
> +               new_index = get_xfs_index(dev, skb);
> +       } else if (skb->ooo_okay || queue_index >= dev->real_num_tx_queues) {
> +               /* Queue index in socket, see if we can find a better one */
> +               new_index = get_xps_queue(dev, skb);
> +       } else {
> +               /* Valid queue in socket and can't send OOO. Just return it */
> +               return queue_index;
> +       }
> +
> +       if (new_index < 0) {
> +               /* No queue index from flow steering, fallback to hash */
> +               new_index = skb_tx_hash(dev, skb);
> +       }

You could just move the comment above the if statement and drop the braces.

> +
> +       if (queue_index != new_index && sk && sk_fullsock(sk) &&
> +           rcu_access_pointer(sk->sk_dst_cache))
> +               sk_tx_queue_set(sk, new_index);
> +
> +       return new_index;
>  }
>
>  struct netdev_queue *netdev_pick_tx(struct net_device *dev,
> --
> 2.8.0.rc2
>