lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Date:   Wed, 6 Jun 2018 15:52:04 -0700
From:   "Nambiar, Amritha" <amritha.nambiar@...el.com>
To:     Tom Herbert <tom@...bertland.com>
Cc:     Linux Kernel Network Developers <netdev@...r.kernel.org>,
        "David S. Miller" <davem@...emloft.net>,
        Alexander Duyck <alexander.h.duyck@...el.com>,
        Willem de Bruijn <willemdebruijn.kernel@...il.com>,
        Sridhar Samudrala <sridhar.samudrala@...el.com>,
        Eric Dumazet <edumazet@...gle.com>,
        Hannes Frederic Sowa <hannes@...essinduktion.org>
Subject: Re: [net-next PATCH v3 3/5] net: Enable Tx queue selection based on
 Rx queues

On 6/5/2018 10:57 AM, Tom Herbert wrote:
> 
> 
> On Tue, Jun 5, 2018 at 1:38 AM, Amritha Nambiar
> <amritha.nambiar@...el.com <mailto:amritha.nambiar@...el.com>> wrote:
> 
>     This patch adds support to pick Tx queue based on the Rx queue(s) map
>     configuration set by the admin through the sysfs attribute
>     for each Tx queue. If the user configuration for receive queue(s) map
>     does not apply, then the Tx queue selection falls back to CPU(s) map
>     based selection and finally to hashing.
> 
>     Signed-off-by: Amritha Nambiar <amritha.nambiar@...el.com
>     <mailto:amritha.nambiar@...el.com>>
>     Signed-off-by: Sridhar Samudrala <sridhar.samudrala@...el.com
>     <mailto:sridhar.samudrala@...el.com>>
>     ---
>      include/net/busy_poll.h |    3 ++
>      include/net/sock.h      |   14 +++++++++++
>      net/core/dev.c          |   60
>     ++++++++++++++++++++++++++++++++---------------
>      net/core/sock.c         |    4 +++
>      net/ipv4/tcp_input.c    |    3 ++
>      5 files changed, 65 insertions(+), 19 deletions(-)
> 
>     diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
>     index 71c72a9..fc4fb68 100644
>     --- a/include/net/busy_poll.h
>     +++ b/include/net/busy_poll.h
>     @@ -136,6 +136,9 @@ static inline void sk_mark_napi_id(struct sock
>     *sk, const struct sk_buff *skb)
>      #ifdef CONFIG_NET_RX_BUSY_POLL
>             sk->sk_napi_id = skb->napi_id;
>      #endif
>     +#ifdef CONFIG_XPS
>     +       sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
>     +#endif
>      }
> 
>      /* variant used for unconnected sockets */
>     diff --git a/include/net/sock.h b/include/net/sock.h
>     index 4f7c584..12313653 100644
>     --- a/include/net/sock.h
>     +++ b/include/net/sock.h
>     @@ -139,6 +139,7 @@ typedef __u64 __bitwise __addrpair;
>       *     @skc_node: main hash linkage for various protocol lookup tables
>       *     @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
>       *     @skc_tx_queue_mapping: tx queue number for this connection
>     + *     @skc_rx_queue_mapping: rx queue number for this connection
>       *     @skc_flags: place holder for sk_flags
>       *             %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
>       *             %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
>     @@ -215,6 +216,9 @@ struct sock_common {
>                     struct hlist_nulls_node skc_nulls_node;
>             };
>             int                     skc_tx_queue_mapping;
>     +#ifdef CONFIG_XPS
>     +       int                     skc_rx_queue_mapping;
> 
> 
> This is still an expensive cost to be adding an int field into sock_common
> for a relatively rare use case. Maybe there should be a CONFIG_XPS_RQS?
> Or maybe skc_tx_queue_mapping and skc_rx_queue_mapping could be shorts
> (so the maximum queue mapping would then be 2^16-2).

Thanks for the review, Tom. I will fix up the code incorporating all
your feedback in the next version (v4). I could have a new config option
CONFIG_XPS_RXQS that would be default off, in addition to the CONFIG_XPS
option that's already there. As for changing 'skc_tx_queue_mapping' to
short, my concern is that the change would become extensive; there are a
lot of places where this gets filled with int or u32 values.

> 
>     +#endif
>             union {
>                     int             skc_incoming_cpu;
>                     u32             skc_rcv_wnd;
>     @@ -326,6 +330,9 @@ struct sock {
>      #define sk_nulls_node          __sk_common.skc_nulls_node
>      #define sk_refcnt              __sk_common.skc_refcnt
>      #define sk_tx_queue_mapping    __sk_common.skc_tx_queue_mapping
>     +#ifdef CONFIG_XPS
>     +#define sk_rx_queue_mapping    __sk_common.skc_rx_queue_mapping
>     +#endif
> 
>      #define sk_dontcopy_begin      __sk_common.skc_dontcopy_begin
>      #define sk_dontcopy_end                __sk_common.skc_dontcopy_end
>     @@ -1696,6 +1703,13 @@ static inline int sk_tx_queue_get(const
>     struct sock *sk)
>             return sk ? sk->sk_tx_queue_mapping : -1;
>      }
> 
>     +static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff
>     *skb)
>     +{
>     +#ifdef CONFIG_XPS
>     +       sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
>     +#endif
>     +}
>     +
>      static inline void sk_set_socket(struct sock *sk, struct socket *sock)
>      {
>             sk_tx_queue_clear(sk);
>     diff --git a/net/core/dev.c b/net/core/dev.c
>     index bba755f..1880e6c 100644
>     --- a/net/core/dev.c
>     +++ b/net/core/dev.c
>     @@ -3479,36 +3479,58 @@ sch_handle_egress(struct sk_buff *skb, int
>     *ret, struct net_device *dev)
>      }
>      #endif /* CONFIG_NET_EGRESS */
> 
>     -static inline int get_xps_queue(struct net_device *dev, struct
>     sk_buff *skb)
>     +#ifdef CONFIG_XPS
>     +static int __get_xps_queue_idx(struct net_device *dev, struct
>     sk_buff *skb,
>     +                              struct xps_dev_maps *dev_maps,
>     unsigned int tci)
>     +{
>     +       struct xps_map *map;
>     +       int queue_index = -1;
>     +
>     +       if (dev->num_tc) {
>     +               tci *= dev->num_tc;
>     +               tci += netdev_get_prio_tc_map(dev, skb->priority);
>     +       }
>     +
>     +       map = rcu_dereference(dev_maps->attr_map[tci]);
>     +       if (map) {
>     +               if (map->len == 1)
>     +                       queue_index = map->queues[0];
>     +               else
>     +                       queue_index = map->queues[reciprocal_scale(
>     +                                               skb_get_hash(skb),
>     map->len)];
>     +               if (unlikely(queue_index >= dev->real_num_tx_queues))
>     +                       queue_index = -1;
>     +       }
>     +       return queue_index;
>     +}
>     +#endif
>     +
>     +static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>      {
>      #ifdef CONFIG_XPS
>             struct xps_dev_maps *dev_maps;
>     -       struct xps_map *map;
>     +       struct sock *sk = skb->sk;
>             int queue_index = -1;
>     +       unsigned int tci = 0;
> 
>             if (!static_key_false(&xps_needed))
>                     return -1;
> 
>     +       if (sk && sk->sk_rx_queue_mapping <= dev->num_rx_queues)
>     +               tci = sk->sk_rx_queue_mapping;
> 
> 
> This is only needed if xps_rxqs_map is not null, so it should be in
> the block below.
>  
> 
>     +
>             rcu_read_lock();
>     -       dev_maps = rcu_dereference(dev->xps_cpus_map);
>     -       if (dev_maps) {
>     -               unsigned int tci = skb->sender_cpu - 1;
>     +       dev_maps = rcu_dereference(dev->xps_rxqs_map);
>     +       if (dev_maps)
>     +               queue_index = __get_xps_queue_idx(dev, skb,
>     dev_maps, tci);
> 
>     -               if (dev->num_tc) {
>     -                       tci *= dev->num_tc;
>     -                       tci += netdev_get_prio_tc_map(dev,
>     skb->priority);
>     -               }
> 
>     -               map = rcu_dereference(dev_maps->attr_map[tci]);
>     -               if (map) {
>     -                       if (map->len == 1)
>     -                               queue_index = map->queues[0];
>     -                       else
>     -                               queue_index =
>     map->queues[reciprocal_scale(skb_get_hash(skb),
>     -                                                                   
>           map->len)];
>     -                       if (unlikely(queue_index >=
>     dev->real_num_tx_queues))
>     -                               queue_index = -1;
>     -               }
>     +       if (queue_index < 0) {
>     +               tci = skb->sender_cpu - 1;
>     +               dev_maps = rcu_dereference(dev->xps_cpus_map);
>     +               if (dev_maps)
>     +                       queue_index = __get_xps_queue_idx(dev, skb,
>     dev_maps,
>     +                                                         tci);
>             }
>             rcu_read_unlock();
> 
>     diff --git a/net/core/sock.c b/net/core/sock.c
>     index 435a0ba..3c10d31 100644
>     --- a/net/core/sock.c
>     +++ b/net/core/sock.c
>     @@ -2824,6 +2824,10 @@ void sock_init_data(struct socket *sock,
>     struct sock *sk)
>             sk->sk_pacing_rate = ~0U;
>             sk->sk_pacing_shift = 10;
>             sk->sk_incoming_cpu = -1;
>     +
>     +#ifdef CONFIG_XPS
>     +       sk->sk_rx_queue_mapping = -1;
>     +#endif
>             /*
>              * Before updating sk_refcnt, we must commit prior changes
>     to memory
>              * (Documentation/RCU/rculist_nulls.txt for details)
>     diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>     index d5ffb57..cc69f75 100644
>     --- a/net/ipv4/tcp_input.c
>     +++ b/net/ipv4/tcp_input.c
>     @@ -78,6 +78,7 @@
>      #include <linux/errqueue.h>
>      #include <trace/events/tcp.h>
>      #include <linux/static_key.h>
>     +#include <net/busy_poll.h>
> 
>      int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
> 
>     @@ -5574,6 +5575,7 @@ void tcp_finish_connect(struct sock *sk,
>     struct sk_buff *skb)
>             if (skb) {
>                     icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
>                     security_inet_conn_established(sk, skb);
>     +               sk_mark_napi_id(sk, skb);
>             }
> 
>             tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
>     @@ -6402,6 +6404,7 @@ int tcp_conn_request(struct request_sock_ops
>     *rsk_ops,
>             tcp_rsk(req)->snt_isn = isn;
>             tcp_rsk(req)->txhash = net_tx_rndhash();
>             tcp_openreq_init_rwin(req, sk, dst);
>     +       sk_mark_rx_queue(req_to_sk(req), skb);
>             if (!want_cookie) {
>                     tcp_reqsk_record_syn(sk, req, skb);
>                     fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
> 
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ